.libPaths( c( .libPaths(), "/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library") )
.libPaths()
library(dplyr, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(stringr, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(data.table, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(ggbeeswarm, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(glue, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(ggforce, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(pec, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(patchwork, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(survival, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(survminer, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(IRdisplay, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(cowplot, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
# library(igraph)
library(ggpubr, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library") # load before survminer
library(ggrepel, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(kableExtra, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
source('/work/isabl/home/gaot/facets-ch/R/facets-ch-utils.R')
source('/work/isabl/home/gaot/ch_cnv/ch-cnv/notebooks/plot_forest.R')
setwd('/work/isabl/home/gaot/ch_cnv')
library("ggpubr", lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(dplyr, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(stringr, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(data.table, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(ggbeeswarm, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(glue, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(ggforce, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(pec, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(patchwork, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(survival, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(survminer, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(IRdisplay, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(cowplot, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
# library(igraph)
library(ggpubr, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library") # load before survminer
library(ggrepel, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
library(kableExtra, lib.loc="/work/isabl/home/sirenkom/R/R-3.6.1/R-3.6.1/library")
source('/work/isabl/home/gaot/facets-ch/R/facets-ch-utils.R')
source('/work/isabl/home/gaot/ch_cnv/ch-cnv/notebooks/plot_forest.R')
setwd('/work/isabl/home/gaot/ch_cnv')
getwd()
M_wide = fread('./data/sm_wide_jul13.tsv')
# segs = fread('./data/segs_aberrant_may6.tsv', sep = '\t')
segs = rbind(
fread('./data/segs_aberrant_may18.tsv', sep = '\t'),
fread('./data/segs_aberrant_nonparta.tsv', sep = '\t'),
fill = TRUE
)
segs_filtered_old = fread('./data/segs_aberrant_apr19.tsv', sep = '\t')
gene_bed = fread('./data/impact_genes.tsv') %>%
mutate(chrom = ifelse(chrom == 'X', 23, chrom)) %>%
mutate(chrom = ifelse(chrom == 'Y', 24, chrom)) %>%
mutate(chrom = as.integer(chrom))
chrom_lens = fread('/home/gaot/ref/chrom_lens.tsv') %>% filter(chrom != 24)
acen = fread('/home/gaot/ref/acen.tsv')
colnames(acen) = c('chrom', 'start', 'end', 'arm', 'type')
chrom_lens = fread('/home/gaot/ref/chrom_lens.tsv') %>% filter(chrom != 24)
chrom_levels = c(as.character(1:22), 'X')
acen = acen %>%
mutate(chrom = str_remove(chrom, 'chr')) %>%
filter(chrom != 'Y') %>%
mutate(
chrom = ifelse(chrom == 'X', 23, chrom),
chrom = as.integer(chrom)
) %>%
arrange(chrom) %>%
group_by(chrom) %>%
summarise(cen_start = min(start), cen_end = max(end)) %>%
ungroup()
# mCA positive patient EMR text search
emr_review = fread('./data/clinicalreview/cnv_patient_asm_review.tsv') %>% filter(exclude == 1)
M_wide = M_wide %>%
mutate(
DateofProcedure = as.Date(DateofProcedure, format = '%Y-%m-%d'),
date_birth = as.Date(date_birth, format = '%Y-%m-%d'),
date_death = as.Date(date_death, format = '%Y-%m-%d')
) %>%
mutate(age = as.numeric((DateofProcedure - date_birth)/365)) %>%
filter(!active_heme) %>%
filter(QC %in% c('Passed', '#N/A')) %>%
filter(age > 20) %>%
filter(!dmp_patient_id %in% emr_review$dmp_patient_id)
# clean up heme diagnosis
M_wide = M_wide %>%
mutate(heme_cat_billing = heme_cat) %>%
mutate(heme_cat = ifelse(is.na(heme_cat_review), heme_cat, heme_cat_review)) %>%
mutate(
manual_review = as.integer(!is.na(valid_diagnosis)),
valid_diagnosis = ifelse(is.na(valid_diagnosis), 0, valid_diagnosis)
) %>%
mutate(
heme_cat = ifelse(valid_diagnosis == 0 & manual_review, 'Invalid', heme_cat),
post_heme = ifelse(valid_diagnosis == 1 & manual_review, secondary_heme, 0)
) %>%
mutate(
post_cll = as.integer(heme_cat == 'CLL' & post_heme),
post_mpn = as.integer(heme_cat %in% c('PV', 'ET', 'MPN') & post_heme),
post_tmn = as.integer(heme_cat %in% c('AML', 'MDS') & post_heme),
post_cml = as.integer(heme_cat %in% c('CML') & post_heme)
) %>%
mutate(
post_anyblood = post_heme,
post_mn = as.integer(post_mpn | post_tmn),
post_leukemia = as.integer(post_cll | post_mpn | post_tmn | post_cml)
) %>%
mutate(heme_cat = ifelse(heme_cat %in% c('ET', 'PV'), 'MPN', heme_cat))
# recode hemecat
M_wide = M_wide %>%
mutate(heme_cat2 = heme_cat) %>%
mutate(
heme_cat2 = ifelse(heme_cat2 == 'LCH', NA, heme_cat2),
heme_cat2 = ifelse(heme_cat2 %in% c('ET', 'PV'), 'MPN', heme_cat2),
heme_cat2 = ifelse(heme_cat2 %in% c('ALCL', 'CTCL'), 'T-NHL', heme_cat2),
heme_cat2 = ifelse(heme_cat2 %in% c('DLBCL', 'FL', 'MZL', 'PCNSL', 'WM'), 'B-NHL', heme_cat2)
) %>%
mutate(heme_cat2 = factor(heme_cat2, c('MDS', 'MPN', 'AML', 'CML', 'CLL', 'B-NHL', 'T-NHL', 'MM')))
M_wide = M_wide %>% mutate(post_heme = ifelse(is.na(heme_cat2), 0, post_heme))
display(paste0(sum(M_wide$post_tmn), 'tMN, ', sum(M_wide$post_cll), 'CLL, ', sum(M_wide$post_mpn), 'MPN'))
# clean up survival data
M_wide = M_wide %>%
mutate(race_b = ifelse(Race == "WHITE",1,0)) %>%
rowwise() %>%
mutate(
date_lastfu = as.Date(date_lastfu, format = '%Y-%m-%d'),
# sometimes last contact date is not accurate
time_lastfu = max(as.integer(date_lastfu - DateofProcedure), 0),
time_tmn = ifelse(post_tmn == 1, seq_to_diag, time_lastfu),
time_mn = ifelse(post_mn == 1, seq_to_diag, time_lastfu),
time_cll = ifelse(post_cll == 1, seq_to_diag, time_lastfu),
time_anyblood = ifelse(post_anyblood == 1, seq_to_diag, time_lastfu),
time_leukemia = ifelse(post_leukemia == 1, seq_to_diag, time_lastfu)
) %>%
ungroup() %>%
mutate(
ch_sm = ch_all
) %>%
mutate(tumor_type = GeneralTumorType)
# clean up smoke
M_wide = M_wide %>%
mutate(smoke = ifelse(is.na(smoke_bin), 'missing', c('1' = 'yes', '0' = 'no')[as.character(smoke_bin)])) %>%
mutate(smoke = factor(smoke, c('no', 'yes', 'missing')))
therapy_detailed = fread('./data/therapy_chemoxrt.tsv')
therapy_detailed = therapy_detailed %>% filter(complete.cases(.))
M_wide = M_wide %>%
left_join(
therapy_detailed,
by = "dmp_patient_id"
) %>%
mutate(therapy_detailed = !is.na(XRT))
### merge with CNV calls
failed_samples = segs %>% filter(is.na(seg)) %>% pull(dmp_sample_id)
M_wide = M_wide %>% filter(!dmp_sample_id %in% failed_samples)
M_wide %>% count(therapy_detailed)
M_long = fread('./data/sm_long_jul13.tsv') %>%
filter(dmp_patient_id %in% M_wide$dmp_patient_id)
M_long = M_long %>%
rename(chrom = Chrom, start = Start) %>%
mutate(chrom = as.integer(ifelse(chrom == 'X', 23, chrom)))
M_long = M_long %>% distinct(dmp_sample_id, start, Ref, Alt, `.keep_all` = T)
M_long = M_long %>% mutate(
VariantClass2 = case_when(
VariantClass %in% c('Translation_Start_Site', "5'Flank", "3'Flank", "3'UTR", "5'UTR") ~ 'Regulatory',
VariantClass %in% c('Missense_Mutation', 'In_Frame_Del', 'In_Frame_Ins') ~ 'Missense',
VariantClass %in% c('Nonsense_Mutation', 'Nonstop_Mutation', 'Splice_Site', 'Splice_Region', 'Frame_Shift_Del', 'Frame_Shift_Ins') ~ 'Truncating',
T ~ VariantClass)
)
segs = segs %>%
filter(!is.na(seg)) %>%
inner_join(M_wide) %>%
mutate(active_heme = ifelse(is.na(active_heme), T, active_heme))
segs = segs %>% mutate(
z_x = ifelse(is.na(z_x), t_cnlr, z_x),
z_y = ifelse(is.na(z_y), t_valor, z_y),
p_x = ifelse(is.na(p_x), p_cnlr, p_x),
p_y = ifelse(is.na(p_y), p_valor, p_y)
)
segs_filtered = segs %>%
ungroup() %>%
filter((p_chisq < 1e-10 & p_y < 1e-3)) %>%
filter(type != 'err' & err < 0.06) %>%
# germline amplifications
filter(!(phi >= 0.8 & type %in% c('amp'))) %>%
filter(!(type == 'amp' & size < 24e6)) %>%
filter(cnlr <= 0.5) %>%
filter(!(type == 'loh' & size < 8e6)) %>%
filter(!(chrom == 23 & Gender == 'Male' & type == 'loh')) %>%
filter(!(chrom == 23 & type == 'amp')) %>%
# recurrent focal artifacts
filter(!(chrom == 2 & type == 'del' & start > 11e7 & end < 13e7)) %>%
filter(!(chrom == 11 & type == 'del' & start > 60e6 & end < 70e6)) %>%
filter(!(chrom == 7 & start > 40e6 & end < 70e6))
# circulating tumor DNA
segs_filtered = segs_filtered %>% filter(dmp_patient_id != 'P-0027364')
segs_filtered = segs_filtered %>%
left_join(chrom_lens %>% rename(chrom_length = length), by = 'chrom') %>%
mutate(prop_chrom = size / chrom_length) %>%
mutate(focality = ifelse(prop_chrom < 0.15, 'focal', 'broad')) %>%
mutate(cnv_label = paste0(chrom, c('amp' = '+', 'del' = '-', 'loh' = '=')[type])) %>%
mutate(type2 = factor(c('del' = 'DEL', 'loh' = 'CNLOH', 'amp' = 'AMP')[type], c('DEL', 'CNLOH', 'AMP')))
# annodate locality
segs_filtered = segs_filtered %>% left_join(acen, by = "chrom")
segs_filtered = segs_filtered %>%
rowwise() %>%
mutate(
p_length = cen_start,
q_length = chrom_length - cen_end,
p_size = max(min(end, cen_start) - max(start, 0), 0),
q_size = max(min(end, chrom_length) - max(start, cen_end), 0),
p_frac = p_size/p_length,
q_frac = q_size/q_length,
arm = case_when(
p_frac > 0.5 & q_frac > 0.5 ~ 'p,q',
p_frac > q_frac ~ 'p',
T ~ 'q'
),
focality = ifelse(max(p_frac, q_frac) < 0.1, 'focal', 'broad')
) %>%
ungroup()
n_samples = segs_filtered %>% group_by(dmp_patient_id, dmp_sample_id) %>% summarise(n_samples = length(unique(dmp_sample_id))) %>% pull(n_samples) %>% max
display(n_samples)
display(nrow(segs_filtered))
# gemrline calls
M_germline = fread('./data/germline_muts.tsv')
M_germline = M_germline %>% filter(dmp_patient_id %in% M_wide$dmp_patient_id)
M_germline_wide = M_germline %>%
mutate(ind_germline = 1) %>%
reshape2::dcast(
dmp_patient_id ~ Gene,
value.var = 'ind_germline',
fun.aggregate = max,
fill = 0
) %>%
setNames(c('dmp_patient_id', paste0(colnames(.)[-1], '_g'))) %>%
mutate(any_germline = 1)
M_wide = M_wide %>% left_join(
M_germline_wide,
by = 'dmp_patient_id'
) %>%
mutate_at(colnames(M_germline_wide), function(x){ifelse(is.na(x), 0, x)})
segs_filtered = segs_filtered %>%
left_join(M_germline_wide, by = "dmp_patient_id")
# merge with GM data
CNV = segs_filtered %>%
mutate(ch_cnv = 1) %>%
group_by(dmp_patient_id, chrom) %>%
summarise(ch_cnv = max(ch_cnv)) %>%
arrange(chrom) %>%
ungroup() %>%
mutate(chrom = paste0('chr', chrom)) %>%
mutate(chrom = factor(chrom, unique(chrom))) %>%
reshape2::dcast(
dmp_patient_id ~ chrom,
value.var = 'ch_cnv',
fill = 0
) %>%
mutate(ch_cnv = 1)
CNV_detailed = segs_filtered %>%
mutate(ch_cnv = 1) %>%
group_by(dmp_patient_id, chrom, type) %>%
summarise(ch_cnv = max(ch_cnv)) %>%
arrange(chrom) %>%
ungroup() %>%
mutate(chrom = paste0('chr', chrom)) %>%
mutate(chrom = factor(chrom, unique(chrom))) %>%
reshape2::dcast(
dmp_patient_id ~ chrom + type,
value.var = 'ch_cnv',
fill = 0
)
CNV_detailed_arm = segs_filtered %>%
mutate(ch_cnv = 1) %>%
group_by(dmp_patient_id, chrom, type, arm) %>%
summarise(ch_cnv = max(ch_cnv)) %>%
arrange(chrom) %>%
ungroup() %>%
mutate(chrom = paste0('chr', chrom)) %>%
mutate(chrom = factor(chrom, unique(chrom))) %>%
mutate(arm = str_remove(arm, ',')) %>%
reshape2::dcast(
dmp_patient_id ~ chrom + arm + type,
value.var = 'ch_cnv',
fill = 0
)
CNV_types = segs_filtered %>%
mutate(ch_cnv = 1) %>%
mutate(type = paste0('ch_', type)) %>%
group_by(dmp_patient_id, type) %>%
summarise(ch_cnv = max(ch_cnv)) %>%
ungroup() %>%
reshape2::dcast(
dmp_patient_id ~ type,
value.var = 'ch_cnv',
fill = 0
)
segs_filtered_auto = segs_filtered %>% filter(chrom != 23)
window = 5e6
max0 = function(x) {
if (length(x) == 0) {
return(0)
} else {
return(max(x))
}
}
MAXVAF_trans = M_long %>%
filter(ch_nonsilent == 1) %>%
rowwise() %>%
mutate(
locality = ifelse(
any(chrom == segs_filtered_auto$chrom & dmp_patient_id == segs_filtered_auto$dmp_patient_id &
start >= segs_filtered_auto$start - window & start <= segs_filtered_auto$end + window
),
'cis',
'trans'
)
) %>%
ungroup() %>%
filter(locality == 'trans') %>%
group_by(dmp_patient_id) %>%
summarise(
VAF_trans = max(VAF_N),
mutnum_trans = sum(ch_nonsilent),
mutnum_pd_trans = sum(ch_nonsilent & ch_pd),
VAF_pd_trans = max0(VAF_N[ch_pd == 1])
)
chroms = colnames(CNV)[-1]
chroms = chroms[chroms != 'ch_cnv']
genes = c('DNMT3A', 'TET2', 'ASXL1', 'JAK2', 'TP53', 'PPM1D', 'ATM', 'EZH2')
M_wide = M_wide %>%
left_join(CNV, by = "dmp_patient_id") %>%
left_join(CNV_detailed, by = "dmp_patient_id") %>%
left_join(CNV_types, by = "dmp_patient_id") %>%
left_join(CNV_detailed_arm, by = "dmp_patient_id") %>%
left_join(MAXVAF_trans, by = 'dmp_patient_id') %>%
left_join(
segs_filtered_auto %>%
group_by(dmp_patient_id) %>%
summarise(
phi_cnv = max(phi),
phi_cnv_auto = max(phi[chrom != 23]),
n_cnv = n(),
n_cnv_chrom = length(unique(chrom)),
ch_cnv_auto = 1,
.groups = 'drop'),
by = 'dmp_patient_id'
) %>%
mutate_at(colnames(CNV), function(x){ifelse(is.na(x), 0, x)}) %>%
mutate_at(colnames(CNV_detailed), function(x){ifelse(is.na(x), 0, x)}) %>%
mutate_at(colnames(CNV_detailed_arm), function(x){ifelse(is.na(x), 0, x)}) %>%
mutate_at(colnames(CNV_types), function(x){ifelse(is.na(x), 0, x)}) %>%
mutate_at(
c('phi_cnv', 'VAF_trans', 'VAF_pd_trans', 'n_cnv', 'n_cnv_chrom', 'ch_cnv_auto', 'mutnum_trans', 'mutnum_pd_trans', 'phi_cnv_auto', 'phi_cnv'),
function(x){ifelse(is.na(x), 0, x)}
)
M_long = M_long %>%
left_join(CNV, by = "dmp_patient_id") %>%
left_join(CNV_detailed, by = "dmp_patient_id") %>%
left_join(CNV_types, by = "dmp_patient_id") %>%
left_join(CNV_detailed_arm, by = "dmp_patient_id") %>%
mutate_at(colnames(CNV), function(x){ifelse(is.na(x), 0, x)}) %>%
mutate_at(colnames(CNV_detailed), function(x){ifelse(is.na(x), 0, x)}) %>%
mutate_at(colnames(CNV_types), function(x){ifelse(is.na(x), 0, x)}) %>%
mutate(ch_cnv_auto = as.integer(ch_cnv & (!chr23)))
M_wide = M_wide %>%
mutate(age_bin = cut(age, c(seq(20, 80, 10), Inf))) %>%
mutate(
age10 = age/10,
age10_sq = age10^2
)
M_wide = M_wide %>%
mutate(
Race = case_when(
Race == 'ASIAN-FAR EAST/INDIAN SUBCONT' ~ 'asian',
Race == 'BLACK OR AFRICAN AMERICAN' ~ 'black',
Race == 'WHITE' ~ 'white',
T ~ 'other'
)
)
# loss of Y
segs_y = fread('./data/y_segs.tsv', sep = '\t')
LOY = segs_y %>% filter(p_x < 1e-17 & cnlr < 0)
M_wide = M_wide %>%
mutate(loy = ifelse(Gender == 'Male', dmp_sample_id %in% LOY$dmp_sample_id, NA)) %>%
left_join(LOY %>% mutate(loy_logr = cnlr)) %>%
mutate(loy_dose = ifelse(loy == 1, -exp(loy_logr), 0)) %>%
mutate(loy_bin = as.character(ifelse(loy == 1, cut(loy_dose, 3), 0)))
# cbcs
cbcs = c("anc", "alc", "amc", "hgb", "mcv", "rdw", "plt", "wbc", "rbc")
M_wide = M_wide %>% mutate(rdw = rdw * mcv)
M_wide = M_wide %>%
mutate_at(cbcs, .funs = list(n = function(x){(x - mean(na.omit(x))) / sd(na.omit(x))}))
nrow(M_wide) %>% paste(' patients valid for analysis!')
cnv_pal = c('loh' = 'darkgreen', 'amp' = 'darkblue', 'del' = 'darkred', 'err' = 'gray')
cnv_pal2 = c('CNLOH' = 'darkgreen', 'AMP' = 'darkblue', 'DEL' = 'darkred')
sm_pal = c('Regulatory' = 'darkturquoise', 'Missense' = 'salmon', 'Truncating' = 'purple')
gene_structures = fread('./data/gene_structures.tsv', sep = '\t') %>% filter(class == 'protein_coding')
sm_col = '#FDCDAC'
cnv_col = '#B3E2CD'
cbc_labels = c('anc' = 'Neutrophil', 'alc' = 'Lymphocyte', 'amc' = 'Monocyte',
'hgb' = 'Hemoglobin', 'mcv' = 'Mean corpuscular volume',
'rdw' = 'Red cell distribution width', 'plt' = 'Platelets',
'wbc' = 'White blood cell', 'rbc' = 'Red blood cell')
do_plot = function(p, f, w, h) {
ggsave(filename = paste0('/work/isabl/home/sirenkom/teng_mCA/figures/', f), plot = p, width = w, height = h, device = 'pdf')
options(repr.plot.width = w, repr.plot.height = h, repr.plot.res = 300)
print(p)
}
# for getting gene structures
# gtf = fread('/work/isabl/ref/homo_sapiens/GRCh37d5/ensembl/75/Homo_sapiens.GRCh37.75.gtf', skip = 'gene_id')
# colnames(gtf) = c('chrom', 'class', 'structure_type', 'start', 'end', 'x', 'y', 'z', 'description')
# gene_structures = gtf %>% filter(str_detect(description, paste(paste0('"', gene_bed$Gene, '"'), collapse = '|')) & structure_type == 'exon')
# # export for sebatia
# M_wide_export = M_wide %>%
# select(one_of(c('dmp_patient_id', 'dmp_sample_id', 'age', 'Gender', 'ch_nonsilent', colnames(CNV), colnames(CNV_detailed), colnames(CNV_types))))
# M_wide_export %>% fwrite('./data/ch_cnvs_sebastia.tsv')
# M_wide %>% filter(ch_cnv == 1) %>% select(dmp_patient_id, nonparta) %>% fwrite('./data/clinicalreview/cnv_patient_review_jul12.tsv', sep = '\t')
M_long
# # sample list for Elli
# M_long %>% group_by(dmp_patient_id) %>%
# filter(any(Gene == 'TP53')) %>%
# left_join(
# M_wide %>% select(dmp_sample_id, post_heme)
# ) %>%
# select(Gene, chrom, start, Ref, Alt, AAchange, dmp_patient_id, dmp_sample_id, ch_cnv, post_heme, heme_cat) %>%
# fwrite('./data/clinicalreview/tp53_all_variants.tsv', sep = '\t')
source('./ch-cnv/notebooks/table_one.R')
M_wide = M_wide %>% mutate(months_followup = as.integer(date_lastfu - DateofProcedure)/(365/12))
options(warn = -1)
table1 = tab_response(M_wide, xs = c('total', 'age', 'Gender', 'Race', 'smoke', 'months_followup', cbcs), y = 'ch_cnv')
options(warn = 0)
kb = table1 %>%
mutate(overall = ifelse(variable %in% c('Gender', 'Race', 'smoke'), ' ', overall)) %>%
rename(Overall = overall) %>%
mutate(variable = ifelse(variable == 'smoke', 'Smoking history', variable)) %>%
mutate(variable = ifelse(variable %in% names(cbc_labels), cbc_labels[variable], variable)) %>%
mutate(variable = str_replace(variable, '-', ' ')) %>%
mutate(levels = ifelse(levels == 'total', ' ', levels)) %>%
mutate(variable = ifelse(variable == 'total', 'total subjects', variable)) %>%
mutate(levels = Hmisc::capitalize(levels)) %>%
mutate(variable = Hmisc::capitalize(variable)) %>%
rename(
'mCA-' = `0`, 'mCA+' = `1`,
Characteristics = variable,
' ' = levels
) %>%
kable(format = "html", booktabs = T) %>%
collapse_rows(columns = 1:2, row_group_label_position = 'stack') %>%
row_spec(0:nrow(table1), background = 'white', align = 'center', extra_css = "border-bottom: 1px solid") %>%
row_spec(0, background = 'gray', extra_css = "border-bottom: 2px solid") %>%
kable_styling(position = "center", full_width = F)
kb %>%
toString() %>%
display_html()
save_kable(kb, './figures/cohort.pdf')
# segs = P$dmp_sample_id %>%
# lapply(function(sid){
# outdir = '/work/isabl/home/gaot/ch_cnv/results'
# segfile = glue('{outdir}/{sid}_seg.tsv')
# if (file.exists(segfile)) {
# seg = fread(segfile)
# noisy = nrow(seg) > 35
# if (noisy) {
# data.frame()
# } else {
# seg %>% mutate(sample = sid) %>% filter(aberrant)
# }
# } else {
# data.frame()
# }
# }) %>% Reduce(rbind, .)
# segs = segs %>% left_join(P, by = c('sample' = 'dmp_sample_id'))
# outdir = '/work/isabl/home/gaot/ch_cnv/results_jul2'
# segs = M_wide$dmp_sample_id %>%
# lapply(function(sid){
# segfile = glue('{outdir}/{sid}_seg.tsv')
# if (file.exists(segfile)) {
# seg = fread(segfile)
# seg %>% mutate(dmp_sample_id = sid) %>% filter(aberrant)
# } else {
# data.frame(dmp_sample_id = c(sid)) %>% mutate(dmp_sample_id = as.character(dmp_sample_id))
# }
# }) %>% Reduce(bind_rows, .)
# print('done!')
# segs %>% fwrite('./data/segs_aberrant_jul2.tsv', sep = '\t')
# outdir = '/work/isabl/public/facets-ch/dmp_processed'
# segs = M_wide %>%
# filter(nonparta == 1) %>%
# pull(dmp_sample_id) %>%
# lapply(function(sid){
# segfile = glue('{outdir}/{sid}_seg.tsv')
# if (file.exists(segfile)) {
# seg = fread(segfile)
# seg %>% mutate(dmp_sample_id = sid) %>% filter(aberrant)
# } else {
# data.frame(dmp_sample_id = c(sid)) %>% mutate(dmp_sample_id = as.character(dmp_sample_id))
# }
# }) %>% Reduce(bind_rows, .)
# print('done!')
# # correct for phi and variant type
# segs[c('type', 'phi', 'err')] = t(mapply(variant_type, segs$cnlr_adj, segs$valor, segs$aberrant))
# segs %>% fwrite('./data/segs_aberrant_nonparta.tsv', sep = '\t')
segs_filtered %>% nrow %>% paste('unique events')
segs_filtered %>% count(auto = chrom != 23)
segs_filtered %>% pull(dmp_patient_id) %>% unique %>% length %>% paste('unique individuals with CNV')
segs_filtered %>%
filter(chrom != 23) %>%
count(type) %>% mutate(prop = signif(n/sum(n), 2))
segs_filtered %>%
filter(chrom != 23) %>%
count(dmp_patient_id) %>% count(n) %>% mutate(prop = signif(nn/sum(nn), 2))
cat('Cell fraction ranges:', segs_filtered$phi %>% range %>% signif(2) %>% paste(collapse = '-') %>% paste('\n'))
cat('median cell fraction:', segs_filtered$phi %>% median %>% signif(2) %>% paste('\n'))
cat('median cell fraction:', segs_filtered$phi %>% median %>% signif(2) %>% paste('\n'))
signif.num <- function(x, ns = FALSE, pch = TRUE) {
if (ns) {
symbols = c("***", "**", "*", ".", "ns")
} else {
symbols = c("***", "**", "*", ".", "")
}
if (pch) {
symbols = c(8, 8, 3, 20)
}
symnum(unlist(x), corr = FALSE, na = FALSE, legend = FALSE,
cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
symbols = symbols)
}
fit = glm(data = M_wide, formula = ch_cnv_auto ~ age + Gender + race_b, family = 'binomial')
D = M_wide %>%
group_by(tumor_type) %>%
summarise(
cnv_freq = sum(ch_cnv_auto)/n(),
cnv_count = sum(ch_cnv_auto),
sm_freq = sum(ch_nonsilent)/n(),
n_male = sum(Gender == 'Male'),
loy_freq = sum(loy[Gender == 'Male'])/n_male,
n = n(),
age_median = median(age),
cnv_freq_pred = sum(predict(fit, data.frame(age = age, Gender = Gender, race_b = race_b), type="response"))/n()
) %>%
mutate(
cnv_freq_lower = qbinom(p = 0.05, size = n, prob = cnv_freq)/n,
cnv_freq_upper = qbinom(p = 0.95, size = n, prob = cnv_freq)/n,
sm_freq_lower = qbinom(p = 0.05, size = n, prob = sm_freq)/n,
sm_freq_upper = qbinom(p = 0.95, size = n, prob = sm_freq)/n,
loy_freq_lower = qbinom(p = 0.05, size = n, prob = loy_freq)/n_male,
loy_freq_upper = qbinom(p = 0.95, size = n, prob = loy_freq)/n_male
) %>%
filter(n > 300) %>%
arrange(cnv_freq_pred) %>%
# mutate(tumor_type = paste0(tumor_type, '(', n, ')')) %>%
mutate(tumor_type = factor(tumor_type, unique(tumor_type)))
D = D %>%
rowwise() %>%
mutate(p = prop.test(x = cnv_count, n = n, p = cnv_freq_pred)$p.value) %>%
mutate(
q = p.adjust(p, method = 'BH'),
q.stars = gtools::stars.pval(q)
) %>%
ungroup()
coef = mean(D$sm_freq)/mean(D$cnv_freq)
p1 = D %>%
ggplot(
aes()
) +
theme_classic() +
geom_point(
inherit.aes = F,
aes(x = tumor_type,
y = cnv_freq_pred
),
color = 'royalblue',
pch = 21,
size = 3
) +
geom_pointrange(
inherit.aes = F,
aes(x = tumor_type,
y = cnv_freq,
ymin = cnv_freq_lower,
ymax = cnv_freq_upper,
# color = 'CNV-CH'
),
stat = 'identity',
pch = 5
) +
# geom_text(
# aes(label = q.stars, x = tumor_type, y = 0.03),
# # hjust = 1.5,
# # vjust = -0.2,
# size = 5
# ) +
theme(
axis.text.x = element_blank(),
plot.margin = unit(c(0,0,0,0), 'mm')
# axis.title.x = element_blank()
) +
xlab('') +
ylab('Proportion with mCA')
give.n <- function(x){
return(c(y = 100, label = length(x)))
}
p2 = M_wide %>% filter(tumor_type %in% D$tumor_type) %>%
mutate(tumor_type = factor(tumor_type, levels(D$tumor_type))) %>%
ggplot(aes(x = tumor_type, y = age)) +
theme_classic() +
geom_violin(draw_quantiles = c(0.5)) +
theme(
axis.text.x = element_text(angle = 30, hjust = 1, size = 7),
plot.margin = unit(c(0,0,0,0), 'mm')
) +
xlab('') +
stat_summary(fun.data = give.n, geom = "text", vjust = 0, size = 2) +
scale_y_continuous(expand = expansion(add = 10)) +
ylab('Age')
panel = p1/p2 + plot_layout(heights = c(2,1))
do_plot(panel, 'cancer_types.pdf', w = 5, h = 4)
table(M_wide$DNMT3A, M_wide$ch_cnv_auto)
table(M_wide$DNMT3A, M_wide$ch_cnv_auto) %>% fisher.test
options(repr.plot.width = 4, repr.plot.height = 2, repr.plot.res = 400)
sm_classes = M_long %>% pull(VariantClass2) %>% unique
colors = c(cnv_pal2, sm_pal)
heme_colors = c(' ' = 'white', 'CLL' = 'royalblue', 'MDS' = 'tomato3', 'MPN' = 'purple', 'Lymphoma' = 'yellowgreen')
colors = c(colors, heme_colors)
gene_set = c(fread('./data/gene_set.tsv')$Gene, 'PPM1D', 'SRSF2')
# basically the same
gene_set = M_long %>%
filter(ch_nonsilent == 1) %>%
filter(Gene %in% gene_set) %>%
filter(dmp_patient_id %in% segs_filtered$dmp_patient_id) %>%
count(Gene) %>% filter(n >= 2) %>% pull(Gene)
# gene_set = c('DNMT3A', 'TET2', 'JAK2', 'ASXL1', 'TP53', 'ATM', 'PPM1D', 'CHEK2',
# 'SRSF2', 'SF3B1', 'U2AF1', 'EZH2', 'MPL', 'NOTCH1', 'TERT', 'SUZ12', 'CBL')
chroms = segs_filtered %>% count(chrom) %>%
# filter(n >= 4) %>%
pull(chrom)
# max_rows = 15
max_rows = 22
plots = list()
for (this_chrom in chroms) {
chrom_label = ifelse(this_chrom == 23, 'X', this_chrom)
segs_auto = segs_filtered %>% filter(chrom == this_chrom)
M_co = rbind(
M_long %>%
filter(Gene %in% gene_set) %>%
filter(ch_nonsilent == 1) %>%
filter(dmp_sample_id %in% segs_auto$dmp_sample_id) %>%
mutate(
mutation = Gene,
type = VariantClass2,
arm = 'na',
focality = 'na'
) %>%
select(mutation, type, arm, focality, dmp_patient_id),
segs_auto %>%
mutate(
mutation = paste0(arm, type, focality)
) %>%
select(mutation, type = type2, arm, focality, dmp_patient_id)
) %>%
group_by(mutation) %>%
mutate(mutation_n = n()) %>%
ungroup()
D = M_co %>%
distinct(mutation, dmp_patient_id, `.keep_all` = T) %>%
group_by(mutation) %>% mutate(mutation_n = n()) %>% ungroup() %>%
arrange(type %in% sm_classes, desc(mutation_n)) %>%
mutate(mutation = factor(mutation, rev(unique(mutation)))) %>%
rowwise() %>%
mutate(mut_weight = 2^which(mutation == levels(mutation))) %>%
ungroup() %>%
group_by(dmp_patient_id) %>%
mutate(patient_weight = sum(mut_weight)) %>%
ungroup() %>%
arrange(desc(patient_weight), type) %>%
mutate(pid = as.integer(factor(dmp_patient_id, unique(dmp_patient_id))))
D = D %>% arrange(desc(mutation)) %>%
mutate(mutation = as.character(mutation)) %>%
mutate(mutation = str_remove_all(mutation, 'amp|del|loh|broad|focal')) %>%
mutate(mutation = factor(mutation, rev(unique(mutation))))
n_rows = length(levels(D$mutation))
n_cols = length(unique(D$pid))
D = D %>% mutate(mutation = factor(mutation, c(as.character(1:(max_rows - n_rows)), levels(mutation))))
nrows_sm = D %>% filter(type %in% sm_classes) %>% pull(mutation) %>% unique %>% length
p = ggplot(
D,
aes(x = pid, y = mutation, fill = type)
) +
geom_rect(
xmin = 0, xmax = n_cols + 0.5, ymin = max_rows - n_rows + nrows_sm + 0.5, ymax = max_rows + 0.5,
fill = 'whitesmoke',
) +
geom_segment(
aes(x = pid + 0.5, xend = pid + 0.5),
y = max_rows - n_rows + 0.5,
yend = max_rows + 0.5,
color = 'gray90',
size = 0.3
) +
geom_hline(yintercept = max_rows - n_rows + nrows_sm + 0.5, size = 0.3, color = 'gray90') +
# geom_hline(yintercept = max_rows + 0.5, size = 0.5, color = 'black') +
# geom_hline(yintercept = max_rows - n_rows + 0.5, size = 0.5, color = 'black') +
geom_segment(x = 0.5, xend = 0.5, y = max_rows - n_rows + 0.5, yend = max_rows + 0.5) +
geom_rect(
aes(
xmin = pid - 0.4,
xmax = pid + 0.4,
ymin = case_when(
focality == 'focal' ~ as.numeric(mutation) - 0.1,
arm == 'p' ~ as.numeric(mutation),
type %in% sm_classes ~ as.numeric(mutation) - 0.45,
T ~ as.numeric(mutation) - 0.45
),
ymax = case_when(
focality == 'focal' ~ as.numeric(mutation) + 0.1,
arm == 'p' ~ as.numeric(mutation) + 0.45,
arm == 'q' ~ as.numeric(mutation),
arm == 'p,q' ~ as.numeric(mutation) + 0.45,
type %in% sm_classes ~ as.numeric(mutation) + 0.45
)
),
size = 0, colour = "white"
) +
# centromeres
geom_point(
data = D %>% filter((!type %in% sm_classes) & focality != 'focal'),
aes(color = type),
size = 1,
show.legend = FALSE
) +
theme_classic() +
theme(
axis.text.x = element_blank(),
plot.margin = margin(t = 0, r = 1, b = 0, l = 0, unit = "mm"),
# panel.spacing = unit(1,'cm'),
axis.line = element_blank(),
axis.ticks = element_blank(),
legend.box = "horizontal",
plot.title = element_text(size = 10)
) +
guides(
fill = guide_legend(ncol = 2, title = 'Variant Types'),
color = FALSE
) +
scale_fill_manual(
values = colors
) +
scale_color_manual(
values = colors
) +
scale_x_discrete(
expand = expansion(add = 0.2)
) +
scale_y_discrete(
expand = expansion(add = 0.2),
labels = str_remove_all(levels(D$mutation), 'broad|focal|del|del|loh|amp|^\\d+$'),
drop = F
) +
ggtitle(chrom_label) +
ylab('') +
xlab('')
if (this_chrom != 14) {
p = p + theme(legend.position = 'none')
} else {
p = p + theme(legend.position = 'right')
}
# patient heme status
# D_heme = D %>% distinct(dmp_patient_id, pid) %>%
# left_join(
# M_wide %>% select(dmp_patient_id, heme_cat),
# by = 'dmp_patient_id'
# ) %>%
# mutate(
# heme_cat = ifelse(heme_cat %in% c('DLBCL', 'FL', 'MZL'), 'Lymphoma', heme_cat)
# ) %>% filter(heme_cat != '')
# if (nrow(D_heme) != 0) {
# p = p + geom_point(
# inherit.aes = F,
# data = D_heme,
# mapping = aes(x = pid, fill = heme_cat),
# y = max_rows - n_rows,
# pch = 24, size = 2,
# colour = "white"
# )
# }
plots[[this_chrom]] = p
}
plots = plots[lengths(plots) != 0]
plot_widths = segs_filtered %>%
distinct(chrom, dmp_patient_id) %>%
filter(chrom %in% chroms) %>% count(chrom) %>% pull(n)
panel = wrap_plots(plotlist = plots, widths = plot_widths, nrow = 1, guides = 'collect')
do_plot(panel, 'oncoplot.pdf', w = 40, h = 3.5)
18002/32442
M_wide %>% count(smoke)
# genes = c('DNMT3A', 'TET2', 'ASXL1', 'EZH2', 'TP53', 'ATM', 'CHEK2', 'PPM1D',
# 'SRSF2', 'SF3B1', 'U2AF1', 'MPL', 'JAK2', 'CBL', 'RUNX1', 'NOTCH1', 'MYD88', 'TERT', 'SUZ12', 'BRCA2', 'KIT', 'IKZF1')
genes = c('DNMT3A', 'TET2', 'ASXL1', 'EZH2', 'TP53', 'ATM', 'CHEK2', 'PPM1D',
'SRSF2', 'SF3B1', 'U2AF1', 'MPL', 'JAK2', 'TERT')
res = data.frame()
# don't test for all if chrom has predominately one kind of change
single_type_chroms = segs_filtered %>%
count(chrom, type) %>%
group_by(chrom) %>%
mutate(total = sum(n)) %>%
mutate(prop = n/total) %>%
filter(prop > 0.7) %>% pull(chrom) %>%
paste0('chr', .)
chroms_detailed = colnames(CNV_detailed)[-1][(CNV_detailed[,-1] %>% colSums) > 3]
chroms_broad = colnames(CNV)[-1][(CNV[,-1] %>% colSums) > 3]
chroms_broad = chroms_broad[chroms_broad != 'ch_cnv']
chroms_broad = chroms_broad[!(chroms_broad %in% single_type_chroms)]
chroms = c(chroms_broad, chroms_detailed)
# chroms = chroms[str_extract(chroms, '\\d+') != 23]
# chroms = c(chroms, 'loy')
ntest = 0
for (gene in genes) {
display(gene)
gene_chrom = gene_bed %>% filter(Gene == gene) %>% pull(chrom)
for (chrom in chroms) {
# chroms[str_extract(chroms, '\\d+') != gene_chrom]
ntest = ntest + 1
# D = M_wide %>% filter(!cytopenia)
D = M_wide
counts = table(D[[gene]], D[[chrom]])
if (ncol(counts) != 1) {
if (chrom == 'chr23_del') {
counts = table(D[D$Gender == 'Female',][[gene]], D[D$Gender == 'Female',][[chrom]])
}
if (chrom == 'loy') {
counts = table(D[D$Gender == 'Male',][[gene]], D[D$Gender == 'Male',][[chrom]])
}
co = counts[2,2]
if (co >= 2) {
test = fisher.test(counts)
res = rbind(res,
data.frame(
gene = gene,
chrom = chrom,
p = test$p.value,
OR = test$estimate,
co = co
))
}
}
}
}
ntest_broad = length(chroms_broad) * length(genes)
ntest_specific = (length(chroms) - length(chroms_broad)) * length(genes)
res = res %>%
mutate(category = ifelse(str_detect(chrom, '_'), 'broad', 'specific')) %>%
mutate(q = ifelse(
category == 'broad',
p.adjust(p, method = 'BH', n = ntest_broad),
p.adjust(p, method = 'BH', n = ntest_specific)
)) %>%
mutate(chrom = str_remove(chrom, 'chr')) %>%
# mutate(q = p.adjust(p, method = 'BH', n = ntest)) %>%
mutate(logOR = log2(OR))
ntest_broad
ntest_specific
limor = 10
width = 0.5
star_dodge = 0.3
key_width = 0.7
ddr = c('TP53', 'PPM1D', 'CHEK2', 'ATM')
splicing = c('SRSF2', 'SF3B1', 'U2AF1')
dna_methylation = c('TET2', 'DNMT3A')
chromatin = c('ASXL1', 'EZH2')
genes = unique(res$gene)
gene_locations = gene_bed %>% filter(Gene %in% genes)
res = res %>%
mutate(signif_cat = case_when(
q < 0.01 ~ 'q<0.01',
q < 0.05 ~ 'q<0.05',
q < 0.1 ~ 'q<0.1',
T ~ 'ns'
))
D = res %>%
rowwise() %>%
mutate(
type = str_split(chrom, '_')[[1]][2],
chrom = str_split(chrom, '_')[[1]][1]
) %>%
mutate(type = ifelse(is.na(type), 'any', type)) %>%
mutate(type = ifelse(chrom == 'loy', 'del', type)) %>%
ungroup() %>%
mutate(chrom = ifelse(chrom == 23, 'X', as.character(chrom))) %>%
mutate(
chrom = factor(chrom, gtools::mixedsort(unique(chrom))),
gene = factor(gene, rev(genes))
) %>%
mutate(
chrom_i = as.integer(chrom),
gene_i = as.integer(gene)
)
D = D %>% mutate(gene_function = case_when(
gene %in% dna_methylation ~ 'DNA methylation',
gene %in% splicing ~ 'Splicing',
gene %in% chromatin ~ 'Chromatin',
gene %in% ddr ~ 'DDR',
T ~ 'Other'
))
p_main = D %>%
rbind(
mutate(.,
chrom_i = case_when(
type == 'amp' ~ chrom_i - width,
type == 'loh' ~ chrom_i - width,
type == 'del' ~ chrom_i + width,
type == 'any' ~ chrom_i - width,
),
gene_i = case_when(
type == 'amp' ~ gene_i + width,
type == 'loh' ~ gene_i + width,
type == 'del' ~ gene_i + width,
type == 'any' ~ gene_i - width,
)
),
mutate(.,
chrom_i = case_when(
type == 'amp' ~ chrom_i - width,
type == 'loh' ~ chrom_i + width,
type == 'del' ~ chrom_i + width,
type == 'any' ~ chrom_i + width,
),
gene_i = case_when(
type == 'amp' ~ gene_i - width,
type == 'loh' ~ gene_i + width,
type == 'del' ~ gene_i - width,
type == 'any' ~ gene_i - width,
)
)
) %>%
rowwise() %>% mutate(OR = min(OR, limor), logOR = log2(OR)) %>% ungroup() %>%
ggplot(
aes()
) +
theme_classic() +
geom_polygon(
aes(x = chrom_i, y = gene_i, group = paste0(gene, chrom, type), fill = OR),
color = 'white',
size = 0.5,
# alpha = 1
) +
geom_vline(data = D, aes(xintercept = chrom_i + 0.5), color = 'gray95') +
geom_hline(data = D, aes(yintercept = gene_i + 0.5), color = 'gray95') +
geom_point(
aes(x = case_when(
type == 'loh' ~ as.numeric(chrom_i),
type == 'del' ~ as.numeric(chrom_i) + star_dodge,
type == 'amp' ~ as.numeric(chrom_i) - star_dodge,
type == 'any' ~ as.numeric(chrom_i)
),
y = case_when(
type == 'loh' ~ gene_i + star_dodge,
type == 'del' ~ as.numeric(gene_i),
type == 'amp' ~ as.numeric(gene_i),
type == 'any' ~ as.numeric(gene_i) - star_dodge
),
pch = signif_cat
),
data = D %>% filter(q < 0.1),
color = 'black',
size = 1.3
) +
geom_tile(
data = gene_locations %>%
mutate(
chrom_i = as.integer(factor(chrom, levels(D$chrom))),
gene_i = as.integer(factor(Gene, levels(D$gene)))
),
aes(x = chrom_i, y = gene_i, linetype = ' '),
alpha = 0,
color = 'black',
size = 0.5
) +
scale_fill_gradient2(low = 'blue', high = 'indianred2', mid = 'white', na.value = 'white', midpoint = 1, limits = c(0, limor)) +
# scale_fill_manual(values = c(cnv_pal, 'any' = 'darkorange')) +
ylab('') +
xlab('') +
scale_x_continuous(
expand = expansion(add = 0.05),
labels = levels(D$chrom),
breaks = 1:length(levels(D$chrom)),
position = "top"
) +
scale_y_continuous(
expand = expansion(add = 0.05), labels = levels(D$gene), breaks = 1:length(levels(D$gene))
) +
theme(
plot.margin = unit(c(0,0,0,0), 'mm'),
axis.text.y = element_text(
face = 'bold.italic',
color = case_when(
as.character(levels(D$gene)) %in% dna_methylation ~ 'royalblue',
as.character(levels(D$gene)) %in% splicing ~ 'firebrick',
as.character(levels(D$gene)) %in% chromatin ~ 'darkolivegreen3',
as.character(levels(D$gene)) %in% ddr ~ 'goldenrod2',
T ~ 'black'
)
),
axis.text.x = element_text(
face = 'bold'
),
legend.key.size = unit(0.3, 'cm'),
legend.position = 'right',
panel.border = element_rect(color = 'gray', fill = NA),
axis.line = element_line(color = 'gray'),
legend.text = element_text(size = 8),
legend.title = element_text(size = 8),
legend.spacing = unit(2, 'mm')
) +
scale_shape_manual(values = c('q<0.01' = 8, 'q<0.05' = 3, 'q<0.1' = 20), name = 'FDR') +
geom_text(
data = D %>% distinct(gene, gene_function) %>% mutate(gene_function = relevel(factor(gene_function), 'Other')),
aes(x = 1,
y = 1,
color = gene_function,
label = gene),
size = 0,
key_glyph = draw_key_rect
) +
# scale_alpha_continuous(range = c(0, 0.5)) +
scale_color_manual(
values = c(
'DNA methylation' = 'royalblue',
'Splicing' = 'firebrick',
'Chromatin' = 'darkolivegreen3',
'DDR' = 'goldenrod2',
'Other' = 'black'
)
) +
guides(
color = guide_legend(keyheight = key_width, keywidth = key_width, title = 'Gene function', override.aes = list(size = 3)),
# alpha = guide_legend(keyheight = key_width, keywidth = key_width),
# fill = guide_legend(keyheight = key_width, keywidth = key_width, override.aes = list(alpha = 0.5)),
linetype = guide_legend(title = 'Gene location', keyheight = key_width, keywidth = key_width)
)
do_plot(p_main, 'trans_heatmap.pdf', w = 6.5, h = 4)
segs_filtered_auto = segs_filtered_auto %>% mutate(
trans_effect =
(chrom == 3 & type == 'amp' & TP53) |
(chrom == 4 & type == 'del' & (!TET2) & (SRSF2 | ATM | CHEK2 | ASXL1)) |
(chrom == 5 & type == 'del' & TP53) |
(chrom == 7 & type == 'del' & (TP53 | PPM1D)) |
(chrom == 8 & type == 'amp' & TET2) |
(chrom == 12 & type == 'amp' & (NOTCH1 | MYD88 | FBXW7)) |
(chrom == 13 & type == 'del' & ATM) |
(chrom == 20 & type == 'del' & (U2AF1 | TERT))
)
trans_total = sum(segs_filtered_auto$trans_effect)
trans_del = sum(segs_filtered_auto %>% filter(type == 'del') %>% pull(trans_effect))
trans_amp = sum(segs_filtered_auto %>% filter(type == 'amp') %>% pull(trans_effect))
trans_total %>% paste0(' total trans events')
trans_del %>% paste0(' total trans deletions')
trans_amp %>% paste0(' total trans amps')
(trans_total/nrow(segs_filtered_auto)) %>% signif(3) %>% {.*100} %>% paste0('% of total events')
(trans_del/sum(segs_filtered_auto$type == 'del')) %>% signif(3) %>% {.*100} %>% paste0('% of total deletions')
(trans_amp/sum(segs_filtered_auto$type == 'amp')) %>% signif(3) %>% {.*100} %>% paste0('% of total amps')
acen = fread('/home/gaot/ref/acen.tsv')
colnames(acen) = c('chrom', 'start', 'end', 'arm', 'type')
chrom_lens = fread('/home/gaot/ref/chrom_lens.tsv') %>% filter(chrom != 24)
chrom_levels = c(as.character(1:22), 'X')
acen = acen %>%
mutate(chrom = str_remove(chrom, 'chr')) %>%
filter(chrom != 'Y') %>%
mutate(
chrom = ifelse(chrom == 'X', 23, chrom),
chrom = as.integer(chrom)
) %>%
arrange(chrom) %>%
group_by(chrom) %>%
summarise(cen_start = min(start), cen_end = max(end)) %>%
ungroup()
cytoband = fread('./cytoBand.txt') %>% setNames(c('chrom', 'start', 'end', 'name', 'stain'))
cytoband = cytoband %>%
mutate(chrom = str_remove(chrom, 'chr')) %>%
mutate(chrom = ifelse(chrom == 'X', 23, chrom)) %>%
filter(chrom != 'Y') %>%
mutate(chrom = as.integer(chrom))
cyto_colors = c(
'gpos100'= rgb(0/255.0,0/255.0,0/255.0),
'gpos' = rgb(0/255.0,0/255.0,0/255.0),
'gpos75' = rgb(130/255.0,130/255.0,130/255.0),
'gpos66' = rgb(160/255.0,160/255.0,160/255.0),
'gpos50' = rgb(200/255.0,200/255.0,200/255.0),
'gpos33' = rgb(210/255.0,210/255.0,210/255.0),
'gpos25' = rgb(200/255.0,200/255.0,200/255.0),
'gvar' = rgb(220/255.0,220/255.0,220/255.0),
'gneg' = rgb(255/255.0,255/255.0,255/255.0),
'acen' = rgb(217/255.0,47/255.0,39/255.0),
'stalk' = rgb(100/255.0,127/255.0,164/255.0)
)
window = 5e6
M_overlap = M_long %>%
filter(dmp_patient_id %in% segs_filtered$dmp_patient_id) %>%
rowwise() %>%
filter(
any(chrom == segs_filtered$chrom & dmp_patient_id == segs_filtered$dmp_patient_id &
start >= segs_filtered$start - window & start <= segs_filtered$end + window)
) %>%
ungroup() %>%
mutate(VariantClass = case_when(
VariantClass %in% c('Silent', 'Intron') ~ 'noncoding',
T ~ 'coding'
))
p_segs = segs_filtered %>%
mutate(chrom = factor(ifelse(chrom == 23, 'X', as.character(chrom)), chrom_levels)) %>%
group_by(dmp_sample_id, chrom) %>%
mutate(total_length = sum(size)) %>%
mutate(type2 = factor(type2, c('AMP', 'CNLOH', 'DEL'))) %>%
arrange(type2, round(start/1e7), -total_length) %>%
group_by(chrom) %>%
mutate(index = as.integer(factor(dmp_sample_id, unique(dmp_sample_id)))) %>%
# left_join(
# M_overlap %>% select(dmp_patient_id, chrom, sm_pos = start, Gene, VariantClass, VAF_N),
# by = c('chrom', 'dmp_patient_id')
# ) %>%
ungroup() %>%
ggplot(
aes(x = start, xend = end, y = index, yend = index, color = type2)
) +
geom_rect(
inherit.aes = F,
data = chrom_lens %>% mutate(chrom = factor(ifelse(chrom == 23, 'X', as.character(chrom)), chrom_levels)),
aes(xmin = 0, xmax = length, ymin = -2, ymax = -0.5),
color = 'black',
fill = 'white',
size = 1
) +
geom_rect(
data = cytoband %>% mutate(chrom = factor(ifelse(chrom == 23, 'X', as.character(chrom)), chrom_levels)),
inherit.aes = F,
aes(xmin = start, xmax = end, ymin = -2, ymax = -0.5, fill = stain),
size = 1,
show.legend = FALSE
) +
geom_segment(
lineend = 'round',
size = 1
) +
scale_fill_manual(values = cyto_colors) +
theme_void() +
theme(
axis.text = element_blank(),
axis.ticks = element_blank(),
axis.title = element_blank(),
legend.position = 'right',
panel.spacing.y = unit(0, "lines"),
plot.margin = unit(c(0,0,0,0), "lines"),
strip.text.y = element_text(size = 10, hjust = 0.5, angle = 0),
panel.border = element_blank(),
strip.background = element_blank()
) +
scale_y_discrete(expand = expansion(add = 1)) +
scale_x_discrete(expand = c(0.02,0)) +
facet_grid(.~chrom, space = 'free', scales = 'free', switch="both") +
xlab('') +
ylab('') +
scale_color_manual(values = cnv_pal2) +
labs(color = '')
do_plot(p_segs, 'genome_wide.pdf', w = 20, h = 2.2)
cnv_pal2
impact_bed = fread('/work/isabl/data/techniques/51/bed_files/GRCh37/dna-td-impact-468-grch37.targets.bed.gz')
colnames(impact_bed) = c('chrom', 'start', 'end', 'strand', 'target')
impact_bed = impact_bed %>% filter(chrom != 'Y')
impact_bed = impact_bed %>% mutate(chrom = as.integer(ifelse(chrom == 'X', 23, chrom)))
focal_dels = function(gene) {
all_colors = c(
'amp' = 'darkblue', 'loh' = 'darkgreen', 'del' = 'darkred',
'Regulatory' = 'darkturquoise', 'Missense' = 'salmon', 'Truncating' = 'purple')
gene_structure = gene_structures %>% filter(str_detect(description, paste0('"', gene, '"')))
gene_loc = gene_bed %>%
filter(Gene == gene) %>%
as.data.frame(stringsAsFactors = F) %>%
mutate(loc = as.integer((start + end) / 2), chrom = as.integer(chrom))
this_chrom_lens = chrom_lens %>% filter(chrom == gene_loc$chrom)
acen_chrom = acen %>% filter(chrom == gene_loc$chrom)
D = segs_filtered %>%
filter(type == 'del' & start < gene_loc$start + 5e6 & end > gene_loc$end - 5e6) %>%
filter(chrom == gene_loc$chrom) %>%
mutate(chrom = factor(chrom)) %>%
group_by(dmp_sample_id, chrom) %>%
mutate(total_length = sum(size)) %>%
mutate(type = factor(type, c('amp', 'loh', 'del'))) %>%
arrange(desc(type), round(start/1e7), -total_length) %>%
group_by(chrom) %>%
mutate(index = as.integer(factor(dmp_sample_id, unique(dmp_sample_id)))) %>%
ungroup()
p_segs = ggplot(
D,
aes(x = start, xend = end, y = index, yend = index, color = type)
) +
geom_rect(
inherit.aes = F,
data = acen_chrom,
aes(xmin = cen_start, xmax = cen_end, ymin = -Inf, ymax = Inf),
color = 'gray',
size = 0,
alpha = 0.3
) +
geom_segment(
size = 1,
alpha = 0.8,
lineend = 'round'
) +
geom_segment(
inherit.aes = F,
data = this_chrom_lens,
aes(x = 0, xend = length, y = 0, yend = 0),
alpha = 0,
size = 0,
color = 'white'
) +
scale_alpha(
range = c(0, 1)
) +
theme_classic() +
theme(
axis.text = element_blank(),
axis.ticks = element_blank(),
axis.title = element_blank(),
legend.position = 'none',
panel.spacing.y = unit(1, "mm"),
plot.margin = unit(c(0.5,0.2,0.2,0.2), "lines"),
strip.text.y = element_blank(),
panel.border = element_rect(size = 0.2, fill = NA, color = 'gray'),
strip.background = element_rect(size = 0.5, fill = 'seashell3', color = 'seashell3'),
plot.title = element_text(size = 7)
) +
scale_y_discrete(expand = expansion(add = 1)) +
xlab('') +
ylab('') +
scale_color_manual(values = all_colors) +
ggtitle(paste0(gene, ', ', 'chr', gene_loc$chrom)) +
# exon structures
geom_segment(
inherit.aes = F,
data = gene_structure %>% mutate(zoom = TRUE),
aes(x = min(start), xend = max(end), y = -1, yend = -1),
size = 0.1,
color = 'black'
) +
geom_segment(
inherit.aes = F,
data = gene_structure %>% mutate(zoom = TRUE),
aes(x = start, xend = end, y = -1, yend = -1),
size = 2,
color = 'black'
) +
facet_zoom(
zoom.size = 1,
zoom.data = zoom,
xlim = c(gene_loc$start, gene_loc$end)
)
p_segs
}
focal_genes = c('DNMT3A', 'TET2', 'ATM', 'ETV6', 'NF1', 'CHEK2')
plots = list()
for (gene in focal_genes) {
plots[[gene]] = focal_dels(gene)
}
p_focal = plot_grid(plotlist = plots, nrow = 2)
do_plot(p_focal, 'focal_del.pdf', w = 4, h = 4)
window = 5e6
segs_filtered_auto = segs_filtered %>% filter(chrom != 23)
M_overlap = M_long %>%
filter(dmp_patient_id %in% segs_filtered_auto$dmp_patient_id) %>%
rowwise() %>%
mutate(
locality = ifelse(
any(chrom == segs_filtered_auto$chrom & dmp_patient_id == segs_filtered_auto$dmp_patient_id &
start >= segs_filtered_auto$start - window & start <= segs_filtered_auto$end + window
),
'cis',
'trans'
)
) %>%
ungroup() %>%
mutate(variant_type = case_when(
VariantClass %in% c('Silent', 'Intron') ~ 'noncoding',
T ~ 'coding'
))
perm = fread('./data/permutation.tsv', sep = '\t')
# perm = data.frame()
# for (i in 1:2000) {
# if (i %% 500 == 0) {
# display(i)
# }
# res = M_overlap %>%
# mutate(dmp_patient_id = sample(dmp_patient_id)) %>%
# rowwise() %>%
# mutate(
# locality = ifelse(
# any(chrom == segs_filtered_auto$chrom & dmp_patient_id == segs_filtered_auto$dmp_patient_id &
# start >= segs_filtered_auto$start - window & start <= segs_filtered_auto$end + window
# ),
# 'cis',
# 'trans'
# )
# ) %>%
# ungroup() %>%
# group_by(variant_type) %>%
# summarise(
# prop_cis = sum(locality == 'cis')/n(),
# .groups = 'drop'
# ) %>%
# mutate(i = i)
# perm = perm %>% rbind(res)
# }
prop_cis_coding = M_overlap %>% filter(variant_type == 'coding') %>% pull(locality) %>% {sum(. == 'cis')/length(.)}
prop_cis_noncoding = M_overlap %>% filter(variant_type == 'noncoding') %>% pull(locality) %>% {sum(. == 'cis')/length(.)}
prop_coding_nonrecurrent = M_overlap %>%
filter(variant_type == 'coding') %>%
filter(!Gene %in% c('MPL', 'DNMT3A', 'TET2', 'EZH2', 'JAK2', 'ATM', 'TP53')) %>%
pull(locality) %>% {sum(. == 'cis')/length(.)}
prop_cis_coding %>% paste('cis coding')
prop_coding_nonrecurrent %>% paste('cis coding non recurrent')
prop_cis_noncoding %>% paste('cis noncoding')
p_coding = perm %>% filter(variant_type == 'coding') %>% pull(prop_cis) %>% {sum(. > prop_cis_coding)/length(.)}
p_noncoding = perm %>% filter(variant_type == 'noncoding') %>% pull(prop_cis) %>% {sum(. > prop_cis_noncoding)/length(.)}
D = M_overlap %>%
group_by(variant_type) %>%
summarise(
prop_cis_median = sum(locality == 'cis')/n()
) %>%
mutate(
prop_cis_lower = prop_cis_median,
prop_cis_upper = prop_cis_median,
estimate = 'observed'
) %>%
rbind(
perm %>%
group_by(variant_type) %>%
summarise(
prop_cis_lower = quantile(prop_cis, c(.05))[1],
prop_cis_upper = quantile(prop_cis, c(.95))[1],
prop_cis_median = median(prop_cis)
) %>%
mutate(
estimate = 'expected'
)
) %>%
mutate(
variant_type = Hmisc::capitalize(variant_type)
)
p_perm = ggplot(
D,
aes(x = variant_type, y = prop_cis_median, ymin = prop_cis_lower, ymax = prop_cis_upper, pch = estimate)
) +
geom_pointrange() +
theme_classic() +
scale_shape_manual(values=c(16, 5)) +
theme(legend.position = 'right') +
ylab('Proportion of cis locality') +
ylim(0, 0.18) +
xlab('') +
# stat_compare_means(
# comparisons = list(c('coding', 'noncoding')),
# label.y = 0.14,
# size = 0
# )
annotate('text', x = 1, y = 0.18, label = glue('p<0.001'), size = 3) +
annotate('text', x = 2, y = 0.18, label = glue('p={signif(p_noncoding, 2)}'), size = 3) +
theme(legend.title = element_blank())
do_plot(p_perm, 'permutation.pdf', w = 3, h = 2)
options(repr.plot.width = 4, repr.plot.height = 2.5, repr.plot.res = 300)
p_dev = ggplot(
segs_filtered,
aes(x = valor, y = cnlr_adj, color = type)
) +
scale_color_manual(values = cnv_pal) +
# geom_hline(yintercept = 0, linetype = 'dashed', alpha = 0.8, color = 'darkgreen') +
geom_point(pch = 21, stroke = 0.5) +
scale_x_continuous(limits = c(0, 2.5), expand = c(0,0)) +
scale_y_continuous(limits = c(-1, 0.6), expand = c(0,0)) +
theme_classic() +
theme(
legend.position = 'none',
axis.title = element_text(size = 8)
) +
# geom_text(size = 2) +
geom_line(
inherit.aes = F,
data = del_line,
aes(x = valor, y = cnlr),
linetype = 'dashed', alpha = 0.8, color = 'darkred'
) +
geom_line(
inherit.aes = F,
data = amp_line,
aes(x = valor, y = cnlr),
linetype = 'dashed', alpha = 0.8, color = 'darkblue'
) +
geom_line(
inherit.aes = F,
data = loh_line,
aes(x = valor, y = cnlr),
linetype = 'dashed', alpha = 0.8, color = 'darkgreen'
) +
xlab('logOR') +
ylab('logR') +
annotate('text', x = 0.9, y = 0.5, label = 'AMP', color = cnv_pal['amp'], size = 3) +
annotate('text', x = 2.1, y = 0.2, label = 'CNLOH', color = cnv_pal['loh'], size = 3) +
annotate('text', x = 2, y = -0.5, label = 'DEL', color = cnv_pal['del'], size = 3)
# annotate('text', x = 0.2, y = -0.8, label = paste0('n=', nrow(segs_filtered)), color = 'gray30', size = 3)
p_dev
options(repr.plot.width = 5, repr.plot.height = 2, repr.plot.res = 300)
p_age = ggplot(
M_wide %>%
mutate(age_bin = cut(age, c(0, seq(40, 80, 10), 100))) %>%
group_by(age_bin) %>%
summarise(
prop_ch_cnv = sum(ch_cnv)/n(),
prop_ch_cnv_auto = sum(ch_cnv_auto)/n(),
prop_both_ch = sum(ch_cnv & ch_nonsilent)/n(),
n = n()
),
aes(x = age_bin, y = prop_ch_cnv, group = '1')
) +
geom_line() +
theme_classic() +
geom_point() +
ylim(0, 0.04) +
theme(
axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1, size = 8),
axis.title = element_text(size = 8)
) +
ylab('Proportion with mCA')
p_age
options(repr.plot.width = 3, repr.plot.height = 3, repr.plot.res = 300)
give.min <- function(x){
return(c(y = min(x), label = min(x)))
}
p_phi = ggplot(
segs_filtered,
aes(x = type2, y = phi, fill = type2, color = type2)
) +
# geom_hline(yintercept = 0.1, linetype = 'dashed', color = 'gray', size = 0.5, alpha = 0.8) +
geom_violin(size = 0.3, position = position_dodge(width = 0.75), alpha = 0.2, scale = 'width') +
theme_classic() +
geom_quasirandom(width = 0.1, method = 'smiley', size = 0.5) +
scale_fill_manual(values = cnv_pal2) +
scale_color_manual(values = cnv_pal2) +
ylim(0,NA) +
theme(
legend.position = 'none',
axis.title = element_text(size = 8),
axis.text.x = element_text(size = 8, angle = 30, hjust = 1, vjust = 1)
) +
ylab('Aberrant cell fraction') +
xlab('Event type') +
stat_summary(fun.data = give.min, geom = "text", vjust = 1.5, size = 2.5)
p_phi
options(repr.plot.width = 1.5, repr.plot.height = 2, repr.plot.res = 300)
p_n = ggplot(
segs_filtered %>% group_by(dmp_sample_id) %>% mutate(n_sample = n()) %>%
ungroup %>% count(n_sample, active_heme, type) %>% mutate(n = n/n_sample) %>%
mutate(n_sample = case_when(
n_sample < 3 ~ as.character(n_sample), T ~ '3+')),
aes(x = n_sample, y = n, fill = type)
) +
theme_classic() +
geom_bar(stat = 'identity', position = 'stack') +
scale_fill_manual(values = cnv_pal) +
theme(
legend.position = 'none',
axis.title = element_text(size = 8)
) +
xlab('Events per sample')
p_n
panel = p_age + p_n + p_dev + p_phi + plot_layout(widths = c(1.8,1))
do_plot(panel, 'ai_general.pdf', w = 3.8, h = 3.3)
options(repr.plot.width = 5, repr.plot.height = 2, repr.plot.res = 300)
pval = 0.01
label_size = 8
p_mutnum = ggplot(
M_wide %>%
mutate(mutnum_silent = mutnum_silent + mutnum_intronic) %>%
mutate(ch_cnv = ifelse(ch_cnv == 1, 'mCA+', 'mCA-')) %>%
reshape2::melt(
measure.vars = c('mutnum_all', 'mutnum_nonsilent', 'mutnum_silent'),
variable.name = 'mutation_type',
value.name = 'mutnum'
) %>%
filter(mutation_type == 'mutnum_all') %>%
mutate(mutnum = ifelse(mutnum >= 3, '3+', as.character(mutnum))) %>%
count(ch_cnv, mutnum, mutation_type) %>%
group_by(ch_cnv) %>%
mutate(prop = n/sum(n)) %>%
ungroup() %>%
filter(mutnum > 0),
aes(x = ch_cnv, y = prop, fill = mutnum)
) +
geom_col(color = 'black', size = 0.2) +
theme_classic() +
scale_fill_manual(values = scales::seq_gradient_pal("white", sm_col)(c(0.3, 0.6, 1))) +
scale_y_continuous(expand = expansion(add = 0.05), limits = c(0,0.85)) +
stat_compare_means(
comparisons = list(c('mCA+', 'mCA-')),
label.y = 0.8,
size = 0,
method = 't.test'
) +
annotate('text', x = 1.5, y = 0.8, label = '****', size = 3) +
xlab('') +
theme(
legend.key.size = unit(3, 'mm'),
legend.title = element_text(size = 7),
legend.position = 'top',
legend.spacing = unit(0,'mm'),
axis.text.x = element_text(size = 8, angle = 30, hjust = 1)
) +
guides(fill = guide_legend(title.position = "top"))
p_vaf = ggplot(
M_wide %>%
rowwise() %>% mutate(VAF_silent = max(VAF_silent, VAF_intronic)) %>% ungroup() %>%
mutate(ch_cnv = ifelse(ch_cnv == 1, 'mCA+', 'mCA-')) %>%
reshape2::melt(
measure.vars = c('VAF_all', 'VAF_nonsilent', 'VAF_silent'),
variable.name = 'mutation_type',
value.name = 'VAF'
) %>%
filter(VAF > 0 & VAF < 0.5 & mutation_type == 'VAF_all'),
aes(x = ch_cnv, y = VAF, fill = ch_cnv)
) +
geom_violin(draw_quantiles = c(0.5)) +
theme_classic() +
stat_compare_means(
size = 2.5,
method = 't.test',
label = 'p.signif',
comparisons = list(c('mCA+', 'mCA-'))) +
ylim(0,0.55) +
theme(
axis.text.x = element_text(size = 8, angle = 30, hjust = 1),
legend.position = 'none'
) +
xlab('') +
scale_fill_manual(values = c(cnv_col, 'gainsboro'))
D = M_wide %>%
mutate(ch_status = case_when(
ch_cnv & ch_nonsilent ~ 'Both',
ch_cnv == 1 ~ 'mCA only',
ch_nonsilent == 1 ~ 'GM only',
T ~ 'none'
)) %>%
group_by(ch_status) %>%
mutate(n = n()) %>%
ungroup() %>%
filter(ch_status != 'none')
give.n <- function(x){
return(c(y = min(x), label = length(x)))
}
p_age = ggplot(
D,
aes(x = ch_status, y = age, fill = ch_status)
) +
# geom_violin(draw_quantiles = c(0.5)) +
geom_boxplot(outlier.alpha = 0) +
theme_classic() +
stat_compare_means(
comparisons = list(
c('mCA only', 'GM only'),
c('Both', 'GM only'),
c('mCA only', 'Both')
),
size = 2.2,
method = 't.test', label = 'p.signif',
vjust = 0,
step.increase = 0.15
) +
ylim(NA, 130) +
theme(
# plot.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"),
# axis.title.x = element_blank()
axis.text.x = element_text(size = 8, angle = 30, hjust = 1),
legend.position = 'none'
) +
xlab('') +
scale_fill_manual(values = c('mCA only' = cnv_col, 'Both' = 'darkorange', 'GM only' = sm_col)) +
ylab('Age')
# p_phi = ggplot(
# segs_filtered %>%
# mutate(ch_nonsilent = ifelse(ch_nonsilent == 1, 'GM+', 'GM-')),
# aes(x = ch_nonsilent, y = phi, fill = ch_nonsilent)
# ) +
# geom_violin(
# draw_quantiles = c(0.5),
# scale = 'width',
# trim = T
# ) +
# theme_classic() +
# stat_compare_means(
# size = 2.5,
# method = 't.test',
# label = 'p.value',
# label.x = 1.5,
# comparisons = list(c('GM+', 'GM-'))
# ) +
# ylim(0,1) +
# xlab('') +
# ylab('Cell fraction') +
# theme(
# axis.text.x = element_text(size = 8, angle = 30, hjust = 1),
# legend.position = 'none'
# ) +
# xlab('') +
# scale_fill_manual(values = c(sm_col, 'gainsboro'))
do_plot(p_age, 'age.pdf', w = 1.5, h = 1.8)
options(repr.plot.width = 6, repr.plot.height = 2, repr.plot.res = 300)
ggplot(
segs_filtered %>%
mutate(age_bin = cut(age, c(20, seq(40, 80, 10), Inf))) %>%
group_by(age_bin, type) %>%
summarise(
ci_lower = mean(phi) - sd(phi),
ci_upper = mean(phi) + sd(phi),
phi = mean(phi),
n = n()
),
aes(x = age_bin, y = phi, ymin = ci_lower, ymax = ci_upper, group = type, color = type, fill = type)
) +
geom_line() +
theme_classic() +
geom_point() +
# geom_errorbar(size = 0.5, width = 0.1) +
geom_ribbon(alpha = 0.2, size = 0) +
scale_color_manual(values = c('loh' = 'darkgreen', 'amp' = 'darkblue', 'del' = 'darkred', 'err' = 'gray')) +
scale_fill_manual(values = c('loh' = 'darkgreen', 'amp' = 'darkblue', 'del' = 'darkred', 'err' = 'gray')) +
facet_wrap(~type) +
theme(
axis.text.x = element_text(angle = 90, vjust = 0.5),
strip.background = element_blank()
)
segs_laurie = fread('./data/external/segs_laurie.txt')
segs_laurie = segs_laurie %>%
mutate(
size = length * 1e6,
type = c('aupd' = 'loh', 'gain' = 'amp', 'loss' = 'del')[mosaic.type],
start = `Start (GRCh37)`,
end = `End (GRCh37)`,
chrom = chromosome,
patient_id = as.character(subject.id)
) %>%
mutate(n_total = 50222)
segs_loh = fread('./data/external/segs_loh.txt') %>%
filter(COPY_CHANGE != 'undetermined') %>%
mutate(
chrom = as.integer(ifelse(CHR == 'X', 23, CHR)),
size = SIZE_MB * 1e6,
start = `Start (GRCh37)`,
end = `End (GRCh37)`,
type = c('neutral' = 'loh', 'loss' = 'del', 'gain' = 'amp')[COPY_CHANGE],
phi = as.numeric(CELL_FRAC),
patient_id = as.character(ID)
) %>%
mutate(n_total = 151202)
segs_jacobs = fread('./data/external/segs_jacobs.txt') %>%
mutate(
chrom = CHROM,
start = `Start (GRCh37)`,
end = `End (GRCh37)`,
size = SEG_SIZE,
type = c('NEUTRAL' = 'loh', 'LOSS' = 'del', 'GAIN' = 'amp')[STATE],
phi = PROPORTION_ABNORMAL,
patient_id = as.character(ID)
) %>%
mutate(n_total = 31717 + 26136)
segs_machiela = fread('./data/external/segs_machiela.txt') %>%
mutate(
chrom = as.integer(CHR),
start = `Start (GRCh37)`,
end = `End (GRCh37)`,
size = SEG_SIZE,
type = c('Neutral' = 'loh', 'Loss' = 'del', 'Gain' = 'amp')[STATE],
phi = PERCENT,
age = as.numeric(AGE),
patient_id = as.character(ID)
) %>%
select(-AGE, -CHR) %>%
filter(!is.na(type)) %>%
mutate(n_total = 24849)
segs_all = bind_rows(
segs_filtered %>% mutate(study = 'This study') %>%
mutate(n_total = 32444) %>%
mutate(patient_id = dmp_patient_id),
segs_loh %>% mutate(study = 'Loh'),
segs_laurie %>% mutate(study = 'Laurie'),
segs_jacobs %>% mutate(study = 'Jacobs'),
segs_machiela %>% mutate(study = 'Machiela')
) %>%
mutate(study_label = paste0(study, '\n(n=', n_total, ')')) %>%
mutate(study_label = factor(study_label, unique(study_label))) %>%
mutate(study = factor(study, unique(study))) %>%
mutate(type2 = factor(c('del' = 'DEL', 'loh' = 'CNLOH', 'amp' = 'AMP')[type], c('DEL', 'CNLOH', 'AMP')))
study_info = data.frame(
'Study' = c('This study', 'Loh et al.', 'Laurie et al.', 'Jacobs et al.', 'Machiela et al. (TSGII)', 'Jaiswal et al.', 'Genovese et al.', 'Zink et al.', 'Terao et al.', 'Loh et al.'),
'Year' = c(2020, 2018, 2012, 2012, 2015, 2014, 2014, 2017, 2020, 2020),
'Genotyping Platform' = c('Targeted Sequencing', 'SNP-array', 'SNP-array', 'SNP-array', 'SNP-array', 'WES', 'WES', 'WGS', 'SNP-array', 'SNP-array/WES(partial)'),
'Mutation Type' = c('mCA/GM(468 genes)', 'mCA', 'mCA', 'mCA', 'mCA', 'GM', 'GM', 'GM', 'mCA', 'mCA/GM(3 genes)'),
'Sample Size' = c('32,442', '151,202', '50,222', '57,853', '24,849', '17,182', '12,380', '11,262', '179,417', '482,789/49,960'),
'Cohort Type' = c('Cancer', 'Normal', 'Normal', 'Cancer/Normal', 'Cancer/Normal', 'Normal', 'Normal', 'Normal', 'Normal', 'Normal'),
'Haplotype Info' = c('No', 'Yes', 'No', 'No', 'No', '-', '-', '-', 'Yes', 'Yes'),
check.names = FALSE
) %>%
arrange(Year, Study == 'This study')
kb = study_info %>%
kable(format = "html", booktabs = T) %>%
row_spec(0:nrow(study_info), background = 'white', align = 'center', extra_css = "border-bottom: 1px solid") %>%
row_spec(0, background = 'gray', extra_css = "border-bottom: 2px solid") %>%
kable_styling(position = 'center', full_width = FALSE)
kb %>% toString() %>%
display_html()
#save_kable(kb, './figures/studies.pdf')
give.n <- function(x){
return(c(y = max(x), label = length(x)))
}
p = segs_all %>%
mutate(size = size/1e6) %>%
filter(chrom != '23') %>%
filter(phi >= 0.1 | is.na(phi)) %>%
ggplot(
aes(x = type2, y = size, fill = type2)
) +
geom_violin(alpha = 0.8, trim = F, draw_quantiles = c(0.5), scale = 'width', width = 0.8) +
theme_classic() +
scale_fill_manual(values = cnv_pal2) +
scale_color_manual(values = cnv_pal2) +
scale_y_continuous(trans = 'log10', limits = c(0.05, NA)) +
facet_wrap(~study, nrow = 1) +
# stat_summary(fun.data = give.n, geom = "text", hjust = 1.5, vjust = -1, size = 2.5) +
theme(panel.spacing = unit(1, 'mm'), legend.position = 'none') +
ylab('size in Mb') +
xlab('Event type')
do_plot(p, 'size.pdf', w = 8, h = 2.2)
give.min <- function(x){
return(c(y = min(x), label = round(min(x), 2)))
}
p = segs_all %>%
filter(!is.na(phi)) %>%
filter(study != 'Loh') %>%
ggplot(
aes(x = type2, y = phi, fill = type2, color = type2)
) +
geom_violin(size = 0.3, position = position_dodge(width = 0.75), alpha = 0.2, scale = 'width') +
theme_classic() +
geom_quasirandom(width = 0.1, method = 'smiley', size = 0.5) +
scale_fill_manual(values = cnv_pal2) +
scale_color_manual(values = cnv_pal2) +
scale_y_continuous(expand = expansion(add = 0.15)) +
theme(
axis.title = element_text(size = 8)
) +
# scale_y_continuous()
ylab('Aberrant cell fraction') +
xlab('Event type') +
stat_summary(fun.data = give.min, geom = "text", vjust = 1.5, size = 2.5) +
facet_wrap(~study, nrow = 1) +
theme(panel.spacing = unit(2, 'mm'), legend.position = 'none')
do_plot(p, 'cell_frac.pdf', w = 5, h = 2.2)
p = segs_all %>%
filter(chrom != '23') %>%
filter(phi >= 0.1 | is.na(phi)) %>%
group_by(study, chrom, type2) %>%
summarise(
n = length(unique(patient_id)),
prop = n/unique(n_total),
.groups = 'drop'
) %>%
tidyr::complete(chrom, type2, study, fill = list(n = 0, prop = 0)) %>%
ggplot(
aes(x = type2, y = prop, fill = study)
) +
scale_x_discrete(drop=FALSE, expand = expansion(add = 0.5)) +
geom_col(width = 0.5, position = position_dodge(width = 0.5)) +
facet_wrap(~chrom, scale = 'free') +
scale_y_continuous(labels = function(x) format(x, scientific = TRUE)) +
theme_classic() +
theme(
axis.text.y = element_text(size = 6)
) +
ylab('Proportion of subjects') +
xlab('Event type')
do_plot(p, 'comparison_incidence.pdf', w = 11, h = 6.5)
acen = fread('/home/gaot/ref/acen.tsv')
colnames(acen) = c('chrom', 'start', 'end', 'arm', 'type')
chrom_lens = fread('/home/gaot/ref/chrom_lens.tsv') %>% filter(chrom != 24)
acen = acen %>%
mutate(chrom = str_remove(chrom, 'chr')) %>%
filter(chrom != 'Y') %>%
mutate(
chrom = ifelse(chrom == 'X', 23, chrom),
chrom = as.integer(chrom)
) %>%
arrange(chrom) %>%
group_by(chrom) %>%
summarise(cen_start = min(start), cen_end = max(end), .groups = 'drop') %>%
ungroup()
cytoband = fread('./cytoBand.txt') %>% setNames(c('chrom', 'start', 'end', 'name', 'stain'))
cytoband = cytoband %>%
mutate(chrom = str_remove(chrom, 'chr')) %>%
mutate(chrom = ifelse(chrom == 'X', 23, chrom)) %>%
filter(chrom != 'Y') %>%
mutate(chrom = as.integer(chrom))
cyto_colors = c(
'gpos100'= rgb(0/255.0,0/255.0,0/255.0),
'gpos' = rgb(0/255.0,0/255.0,0/255.0),
'gpos75' = rgb(130/255.0,130/255.0,130/255.0),
'gpos66' = rgb(160/255.0,160/255.0,160/255.0),
'gpos50' = rgb(200/255.0,200/255.0,200/255.0),
'gpos33' = rgb(210/255.0,210/255.0,210/255.0),
'gpos25' = rgb(200/255.0,200/255.0,200/255.0),
'gvar' = rgb(220/255.0,220/255.0,220/255.0),
'gneg' = rgb(255/255.0,255/255.0,255/255.0),
'acen' = rgb(217/255.0,47/255.0,39/255.0),
'stalk' = rgb(100/255.0,127/255.0,164/255.0)
)
window = 1e7
M_overlap = M_long %>%
filter(dmp_patient_id %in% segs_filtered$dmp_patient_id) %>%
rowwise() %>%
filter(
any(chrom == segs_filtered$chrom & dmp_patient_id == segs_filtered$dmp_patient_id &
start >= segs_filtered$start - window & start <= segs_filtered$end + window)
) %>%
ungroup() %>%
mutate(VariantClass = case_when(
VariantClass %in% c('Silent', 'Intron') ~ 'noncoding',
T ~ 'coding'
))
p_segs = segs_all %>%
# arrange(study != 'Loh') %>%
mutate(study = factor(study, unique(study))) %>%
filter(phi >= 0.1 | is.na(phi)) %>%
filter(chrom != 23) %>%
group_by(patient_id, chrom) %>%
mutate(total_length = sum(size)) %>%
mutate(type2 = factor(type2, c('AMP', 'CNLOH', 'DEL'))) %>%
arrange(type2, round(start/1e7), -total_length) %>%
group_by(chrom, study) %>%
mutate(index = as.integer(factor(patient_id, unique(patient_id)))) %>%
filter(index < 100) %>%
ungroup() %>%
ggplot(
aes(x = start, xend = end, y = index, yend = index, color = type2)
) +
geom_rect(
inherit.aes = F,
data = chrom_lens %>% filter(chrom != 23),
aes(xmin = 0, xmax = length, ymin = -2, ymax = -0.5),
color = 'black',
fill = 'white',
size = 1
) +
geom_rect(
data = cytoband %>% filter(chrom != 23),
inherit.aes = F,
aes(xmin = start, xmax = end, ymin = -2, ymax = -0.5, fill = stain),
size = 1,
show.legend = FALSE
) +
geom_segment(
lineend = 'round',
size = 0.75
) +
scale_fill_manual(values = cyto_colors) +
theme_void() +
theme(
axis.text = element_blank(),
axis.ticks = element_blank(),
axis.title = element_blank(),
legend.position = 'right',
panel.spacing.y = unit(0.5, "lines"),
plot.margin = unit(c(0,0,0,0), "lines"),
panel.border = element_blank(),
strip.text.y = element_text(size = 14, hjust = 0.5, angle = 0),
strip.background.y = element_rect(size = 1, fill = NA),
strip.background.x = element_blank()
) +
scale_y_discrete(expand = expansion(add = 1)) +
scale_x_discrete(expand = c(0.02,0)) +
facet_grid(study~chrom, space = 'free', scales = 'free', switch="both") +
xlab('') +
ylab('') +
scale_color_manual(values = cnv_pal2, name = '')
do_plot(p_segs, 'comparison_genomewide.pdf', w = 16, h = 10)
## EDITED MS - Reviewer 2
acen = fread('/home/gaot/ref/acen.tsv')
colnames(acen) = c('chrom', 'start', 'end', 'arm', 'type')
chrom_lens = fread('/home/gaot/ref/chrom_lens.tsv') %>% filter(chrom != 24)
acen = acen %>%
mutate(chrom = str_remove(chrom, 'chr')) %>%
filter(chrom != 'Y') %>%
mutate(
chrom = ifelse(chrom == 'X', 23, chrom),
chrom = as.integer(chrom)
) %>%
arrange(chrom) %>%
group_by(chrom) %>%
summarise(cen_start = min(start), cen_end = max(end), .groups = 'drop') %>%
ungroup()
cytoband = fread('./cytoBand.txt') %>% setNames(c('chrom', 'start', 'end', 'name', 'stain'))
cytoband = cytoband %>%
mutate(chrom = str_remove(chrom, 'chr')) %>%
mutate(chrom = ifelse(chrom == 'X', 23, chrom)) %>%
filter(chrom != 'Y') %>%
mutate(chrom = as.integer(chrom))
cyto_colors = c(
'gpos100'= rgb(0/255.0,0/255.0,0/255.0),
'gpos' = rgb(0/255.0,0/255.0,0/255.0),
'gpos75' = rgb(130/255.0,130/255.0,130/255.0),
'gpos66' = rgb(160/255.0,160/255.0,160/255.0),
'gpos50' = rgb(200/255.0,200/255.0,200/255.0),
'gpos33' = rgb(210/255.0,210/255.0,210/255.0),
'gpos25' = rgb(200/255.0,200/255.0,200/255.0),
'gvar' = rgb(220/255.0,220/255.0,220/255.0),
'gneg' = rgb(255/255.0,255/255.0,255/255.0),
'acen' = rgb(217/255.0,47/255.0,39/255.0),
'stalk' = rgb(100/255.0,127/255.0,164/255.0)
)
window = 1e7
M_overlap = M_long %>%
filter(dmp_patient_id %in% segs_filtered$dmp_patient_id) %>%
rowwise() %>%
filter(
any(chrom == segs_filtered$chrom & dmp_patient_id == segs_filtered$dmp_patient_id &
start >= segs_filtered$start - window & start <= segs_filtered$end + window)
) %>%
ungroup() %>%
mutate(VariantClass = case_when(
VariantClass %in% c('Silent', 'Intron') ~ 'noncoding',
T ~ 'coding'
))
p_segs = segs_all %>%
filter(study %in% c("This study", "Loh")) %>%
# arrange(study != 'Loh') %>%
mutate(study = case_when(
study == "Loh" & phi < 0.1 ~ "Loh <10%",
study == "Loh" & phi >= 0.1 ~ "Loh >=10%",
T ~ "This study") ) %>%
mutate(study = factor(study, unique(study))) %>%
#filter(phi >= 0.1 | is.na(phi)) %>%
filter(chrom != 23) %>%
group_by(patient_id, chrom) %>%
mutate(total_length = sum(size)) %>%
mutate(type2 = factor(type2, c('AMP', 'CNLOH', 'DEL'))) %>%
arrange(type2, round(start/1e7), -total_length) %>%
group_by(chrom, study) %>%
mutate(index = as.integer(factor(patient_id, unique(patient_id)))) %>%
# filter(index < 100) %>%
ungroup() %>%
ggplot(
aes(x = start, xend = end, y = index, yend = index, color = type2)
) +
geom_rect(
inherit.aes = F,
data = chrom_lens %>% filter(chrom != 23),
aes(xmin = 0, xmax = length, ymin = -2, ymax = -0.5),
color = 'black',
fill = 'white',
size = 1
) +
geom_rect(
data = cytoband %>% filter(chrom != 23),
inherit.aes = F,
aes(xmin = start, xmax = end, ymin = -2, ymax = -0.5, fill = stain),
size = 1,
show.legend = FALSE
) +
geom_segment(
lineend = 'round',
size = 0.75
) +
scale_fill_manual(values = cyto_colors) +
theme_void() +
theme(
axis.text = element_blank(),
axis.ticks = element_blank(),
axis.title = element_blank(),
legend.position = 'right',
panel.spacing.y = unit(0.5, "lines"),
plot.margin = unit(c(0,0,0,0), "lines"),
panel.border = element_blank(),
strip.text.y = element_text(size = 14, hjust = 0.5, angle = 0),
strip.background.y = element_rect(size = 1, fill = NA),
strip.background.x = element_blank()
) +
scale_y_discrete(expand = expansion(add = 1)) +
scale_x_discrete(expand = c(0.02,0)) +
facet_grid(study~chrom, space = 'free', scales = 'free', switch="both") +
xlab('') +
ylab('') +
scale_color_manual(values = cnv_pal2, name = '')
do_plot(p_segs, 'comparison_genomewide_MS_R2_10p.pdf', w = 16, h = 12)
key = fread('/ifs/dmprequest/12-245/key.txt')[,1:2]
colnames(key) = c('dmp_sample_id', 'bam_name')
key = key %>% mutate(bam_dir = as.character(glue('/ifs/dmpshare/share/irb12_245/{bam_name}.bam')))
key = key %>% mutate(dmp_patient_id = substr(dmp_sample_id, 1, 9))
key = key %>% mutate(bait = substr(dmp_sample_id, nchar(dmp_sample_id) - 2, nchar(dmp_sample_id)))
# M_wide = M_wide %>% left_join(key %>% select(tumor_bam_dir = bam_dir, dmp_tumor_id = dmp_sample_id))
bam_replicates = key %>%
filter(str_detect(dmp_sample_id, 'IM')) %>%
filter(str_detect(dmp_sample_id, '-N')) %>%
group_by(dmp_patient_id) %>%
filter(n()>1) %>%
ungroup()
sample_replicates = fread('./data/replicate_samples_parsed.txt')
sample_replicates = sample_replicates %>%
filter(QC == 'Passed' & ExemplarSampleType == 'Blood') %>%
mutate(DateofProcedureRep = as.Date(Date_of_Procedure, format = '%m/%d/%Y')) %>%
rename(rep_sample_id = dmp_sample_id) %>%
inner_join(M_wide %>% select(dmp_sample_id, dmp_patient_id, DateofProcedure)) %>%
mutate(rep_type = ifelse(DateofProcedureRep == DateofProcedure, 'replicate', 'serial')) %>%
filter(!is.na(DateofProcedureRep))
nrow(sample_replicates) %>% paste('total replicates')
count(sample_replicates, rep_type)
segs_replicates = fread('./data/segs_replicates.tsv', sep = '\t')
failed_samples = segs_replicates %>% filter(is.na(seg)) %>% pull(dmp_sample_id)
cnv_reps = segs_filtered %>% filter(dmp_patient_id %in% sample_replicates$dmp_patient_id)
cnv_reps = cnv_reps %>%
# left_join(M_wide %>% select(dmp_sample_id, DateofProcedure), by = 'dmp_sample_id') %>%
left_join(
sample_replicates %>% select(rep_sample_id = dmp_sample_id, DateofProcedureRep, dmp_patient_id),
by = 'dmp_patient_id')
cnv_reps = cnv_reps %>% filter(!(dmp_sample_id %in% failed_samples | rep_sample_id %in% failed_samples))
cnv_reps = cnv_reps %>% mutate(rep_type = ifelse(DateofProcedureRep == DateofProcedure, 'replicate', 'serial'))
# collect the segments
outdir = '/work/isabl/home/gaot/ch_cnv/results'
segs_reps = c(cnv_reps$dmp_sample_id, cnv_reps$rep_sample_id) %>%
lapply(function(sid){
segfile = glue('{outdir}/{sid}_seg.tsv')
if (file.exists(segfile)) {
seg = fread(segfile)
seg %>% mutate(dmp_sample_id = sid) %>% filter(aberrant)
} else {
data.frame(dmp_sample_id = c(sid)) %>% mutate(dmp_sample_id = as.character(dmp_sample_id))
}
}) %>% Reduce(bind_rows, .)
segs_reps = segs_reps %>% left_join(bam_replicates, by = "dmp_sample_id")
segs_reps = segs_reps %>% mutate(
z_x = ifelse(is.na(z_x), t_cnlr, z_x),
z_y = ifelse(is.na(z_y), t_valor, z_y),
p_x = ifelse(is.na(p_x), p_cnlr, p_x),
p_y = ifelse(is.na(p_y), p_valor, p_y)
)
segs_reps = segs_reps %>%
group_by(dmp_patient_id) %>%
mutate(timepoint = 1:n()) %>%
mutate(timepoint = as.character(timepoint)) %>%
ungroup()
segs_reps = segs_reps %>% filter(type != 'err' & err < 0.06)
# segs_serial %>% fwrite('./data/segs_serial_may26.tsv', sep = '\t')
segs_reps = segs_reps %>%
left_join(M_wide %>% select(dmp_sample_id, DateofProcedure), by = 'dmp_sample_id') %>%
left_join(sample_replicates %>% select(dmp_sample_id, DateofProcedureRep) %>% distinct(), by = 'dmp_sample_id') %>%
mutate(DateofProcedure = if_else(is.na(DateofProcedureRep), DateofProcedure, DateofProcedureRep)) %>%
select(-DateofProcedureRep) %>%
arrange(dmp_patient_id, DateofProcedure, dmp_sample_id)
# segs_replicates = segs_replicates %>%
# mutate(dmp_patient_id = substr(dmp_sample_id, 1, 9)) %>%
# group_by(dmp_patient_id) %>%
# filter(!any(is.na(seg))) %>%
# ungroup() %>%
# left_join(M_wide %>% select(dmp_patient_id, Gender, Sex), by = 'dmp_patient_id')
# segs_replicates = segs_replicates %>% filter(dmp_patient_id %in% sample_replicates$dmp_patient_id)
# segs_replicates = segs_replicates %>% filter((p_chisq < 1e-10 & p_y < 1e-3)) %>%
# filter(type != 'err' & err < 0.06) %>%
# filter(!(phi >= 0.8 & type %in% c('amp'))) %>%
# filter(!(type == 'amp' & size < 24e6)) %>%
# filter(!(type == 'loh' & size < 8e6)) %>%
# filter(!(chrom == 23 & Sex == 'Male' & type == 'loh')) %>%
# filter(!(chrom == 23 & type == 'amp')) %>%
# filter(cnlr <= 0.5)
options(repr.plot.width = 4, repr.plot.height = 3, repr.plot.res = 300)
segs_reps %>%
mutate(label = ifelse(timepoint == '1', chrom, '')) %>%
ggplot(
aes(x = DateofProcedure, y = phi, group = dmp_patient_id, color = type, label = label, size = size)
) +
scale_color_manual(values = cnv_pal) +
scale_fill_manual(values = cnv_pal) +
geom_line(alpha = 0.4, size = 1) +
geom_point(pch = 21, alpha = 1) +
theme_classic() +
geom_text_repel(size = 2, force = 3) +
ylim(0,1) +
# scale_x_discrete(expand = c(0.1,0)) +
scale_x_date(date_breaks = "1 year", date_labels = "%Y")
options(repr.plot.width = 10, repr.plot.height = 2, repr.plot.res = 300)
source('/work/isabl/home/gaot/facets-ch/R/facets-ch-utils.R')
samples = sample_replicates %>%
filter(rep_type == 'replicate') %>%
filter(!(dmp_sample_id %in% failed_samples | rep_sample_id %in% failed_samples)) %>%
filter(dmp_patient_id %in% segs_reps$dmp_patient_id)
plots = list()
plots_rep = list()
for (i in 1:nrow(samples)) {
outdir = '/work/isabl/home/gaot/ch_cnv/results'
sid = samples[i,'dmp_sample_id']
rid = samples[i,'rep_sample_id']
plotfile = glue('{outdir}/{sid}.png')
seg = fread(glue('{outdir}/{sid}_seg.tsv'))
sig = fread(glue('{outdir}/{sid}_sig.tsv'))
seg = seg %>% mutate(aberrant = ifelse(cnlr > 0.5, F, aberrant))
p1 = facets_plot(sig, seg, snp_distance = F)
plots[[i]] = p1
plotfile = glue('{outdir}/{rid}.png')
seg_rep = fread(glue('{outdir}/{rid}_seg.tsv'))
sig_rep = fread(glue('{outdir}/{rid}_sig.tsv'))
seg_rep = seg_rep %>% mutate(aberrant = ifelse(cnlr > 0.5, F, aberrant))
p2 = facets_plot(sig_rep, seg_rep, snp_distance = F)
plots_rep[[i]] = p2
}
## EDITED MS - R2 scatter plot
df_scatter = data.frame()
for (i in 1:nrow(samples)) {
outdir = '/work/isabl/home/gaot/ch_cnv/results'
sid = as.character(samples[i,'dmp_sample_id'] )
rid = as.character(samples[i,'rep_sample_id'] )
# plotfile = glue('{outdir}/{sid}.png')
seg = fread(glue('{outdir}/{sid}_seg.tsv'))
# sig = fread(glue('{outdir}/{sid}_sig.tsv'))
seg = seg %>% mutate(aberrant = ifelse(cnlr > 0.5, F, aberrant)) %>%
filter(aberrant ) %>% select(phi_rep1 = phi, chrom, type)
# p1 = facets_plot(sig, seg, snp_distance = F)
# plots[[i]] = p1
# plotfile = glue('{outdir}/{rid}.png')
seg_rep = fread(glue('{outdir}/{rid}_seg.tsv'))
# sig_rep = fread(glue('{outdir}/{rid}_sig.tsv'))
seg_rep = seg_rep %>% mutate(aberrant = ifelse(cnlr > 0.5, F, aberrant)) %>%
filter(aberrant) %>% select(phi_rep2 = phi )
df_scatter = rbind(df_scatter, cbind(seg, seg_rep))
}
### EDITED MS SCATTER REVIEWER 2
df_scatter = df_scatter %>%
mutate(type2 = factor(c('del' = 'DEL', 'loh' = 'CNLOH', 'amp' = 'AMP')[type], c('DEL', 'CNLOH', 'AMP')))
p_scatter = ggplot(df_scatter, aes( x = phi_rep1, y = phi_rep2, label = chrom, color = type2)) + geom_abline(linetype = "dashed") + geom_point() +
geom_text_repel( ) +
ylim(0,1) + xlim(0,1) + theme_classic() +
scale_color_manual(values = cnv_pal2) + labs(color = " ") + xlab("Cell Fraction 1") + ylab("Cell Fraction 2")
do_plot(p_scatter, "R2_scatter_phi.pdf", h = 3, w = 4)
#CNLOH'darkgreen'AMP'darkblue'DEL'darkred'
options(warn = -1)
wrap_plots(plots, ncol = 1) %>% do_plot('rep_sample.pdf', w = 10, h = 12)
wrap_plots(plots_rep, ncol = 1) %>% do_plot('rep_rep.pdf', w = 10, h = 12)
options(warn = 0)
D = M_long %>%
filter(ch_nonsilent == 1) %>%
rowwise() %>%
mutate(
cis_cnv = any(chrom == segs_filtered$chrom & dmp_patient_id == segs_filtered$dmp_patient_id &
start >= segs_filtered$start - window & start <= segs_filtered$end + window)
) %>%
ungroup() %>%
filter(cis_cnv == 0)
p = ggplot(
D,
aes(x = VAF_N * 2)
) +
geom_density(
trim = TRUE,
alpha = 0.5,
fill = sm_col,
size = 0.5
) +
geom_density(
trim = TRUE,
inherit.aes = F,
data = segs_filtered,
mapping = aes(x = phi),
fill = cnv_col,
alpha = 0.5,
size = 0.5
) +
xlim(0,1) +
theme_classic() +
xlab('Estimated cell fraction')
do_plot(p, 'sensitivity.pdf', w = 2, h = 1.9)
M_long %>% filter(ch_nonsilent == 1) %>% dim
M_long %>% filter(ch_nonsilent == 1) %>% pull(VAF_N) %>% median
M_long %>% filter(ch_nonsilent == 1) %>% pull(VAF_N) %>% range
D %>% pull(VAF_N) %>% {.*2} %>% range
M_wide %>% count(ch_nonsilent) %>% mutate(prop = n/sum(n))
font_size = 8
M_long_nonsilent = M_long %>%
filter(!VariantClass %in% c('Silent', 'Intron')) %>%
mutate(
VariantClass3 = case_when(
VariantClass == 'Frame_Shift_Del' ~ 'Frameshift indel',
VariantClass == 'Frame_Shift_Ins' ~ 'Frameshift indel',
VariantClass == 'In_Frame_Del' ~ 'Inframe indel',
VariantClass == 'In_Frame_Ins' ~ 'Inframe indel',
VariantClass == 'Splice_Site' ~ 'Splice or other',
VariantClass == 'Splice_Region' ~ 'Splice or other',
VariantClass == "5'Flank" ~ 'Splice or other',
VariantClass == 'Translation_Start_Site' ~ 'Splice or other',
VariantClass == 'Nonstop_Mutation' ~ 'Splice or other',
VariantClass == 'Missense_Mutation' ~ 'Missense',
VariantClass == 'Nonsense_Mutation' ~ 'Nonsense',
T ~ VariantClass)
) %>%
mutate(VariantClass3 = factor(VariantClass3))
panel_theme = theme(
legend.direction = 'vertical',
legend.position="top",
legend.text = element_text(size=12),
plot.margin = margin(t = 0, r = 0, b = 0, l = 0),
axis.text = element_text(size = font_size)
)
# All patients
gene_list = M_long_nonsilent %>% count(Gene) %>% arrange(-n) %>% .$Gene %>% unique %>% .[1:30]
p_all = ggplot(
M_long_nonsilent %>% filter(Gene %in% gene_list) %>%
mutate(Gene = as.integer(factor(Gene, gene_list))) %>%
count(Gene, VariantClass3),
aes(x = Gene, y = n, fill = VariantClass3)
) +
geom_bar(stat = 'identity', color = 'black', size = 0.2, width = 0.9) +
ylab("Number of Mutations") +
theme_classic() +
theme(
panel.grid.major = element_blank(),
panel.border = element_blank(),
axis.line = element_line(colour = "white"),
legend.title = element_blank()
) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = 'right',
strip.background = element_rect(fill = NA, size = 1, color = 'gray')
) +
facet_zoom(xlim = c(5.5 + 0.1, 29.5 - 0.1), zoom.size = 1, ylim = c(0,500), horizontal = F, show.area = F) +
scale_x_continuous(labels = gene_list, breaks = 1:length(gene_list), expand = expansion(add=0.1)) +
scale_fill_discrete(drop = F)
# CNV subset
gene_list = M_long_nonsilent %>%
filter(!VariantClass %in% c('Silent', 'Intron')) %>%
filter(dmp_patient_id %in% (M_wide %>% filter(ch_cnv == 1) %>% pull(dmp_patient_id))) %>%
count(Gene) %>% arrange(-n) %>% .$Gene %>% unique %>% .[1:30]
p_cnv = ggplot(
M_long_nonsilent %>%
filter(Gene %in% gene_list) %>%
filter(dmp_patient_id %in% (M_wide %>% filter(ch_cnv == 1) %>% pull(dmp_patient_id))) %>%
mutate(Gene = as.integer(factor(Gene, gene_list))) %>%
count(Gene, VariantClass3),
aes(x = Gene, y = n, fill = VariantClass3)
) +
geom_bar(stat = 'identity', color = 'black', size = 0.2) +
ylab("Number of Mutations") +
theme_classic() +
theme(
panel.grid.major = element_blank(),
panel.border = element_blank(),
axis.line = element_line(colour = "white"),
legend.title = element_blank()
) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = 'right',
strip.background = element_rect(fill = NA, size = 1, color = 'gray')
) +
facet_zoom(xlim = c(5.5 + 0.1, 29.5 - 0.1), zoom.size = 1, ylim = c(0,15), horizontal = F, show.area = F) +
scale_x_continuous(labels = gene_list, breaks = 1:length(gene_list), expand = expansion(add=0.1)) +
scale_fill_discrete(drop = F)
do_plot(p_all, 'sm_bar_all.pdf', w = 8, h = 4)
do_plot(p_cnv, 'sm_bar_ai.pdf', w = 8, h = 4)
D = M_wide %>%
filter(ch_nonsilent == 1) %>%
mutate(mutnum_nonsilent = case_when(
mutnum_nonsilent < 5 ~ as.character(mutnum_nonsilent), T ~ '5+')
) %>%
count(mutnum_nonsilent) %>%
mutate(prop = n/sum(n))
display(D)
p_n_all = ggplot(
D,
aes(x = mutnum_nonsilent, y = n)
) +
theme_classic() +
geom_bar(stat = 'identity', position = 'stack') +
scale_fill_manual(values = cnv_pal) +
theme(
legend.position = 'none',
axis.title = element_text(size = 8)
) +
xlab('Events per patient') +
ylab('Number of patients')
# scale_y_continuous(expand = expansion(add = 0))
D = M_wide %>%
filter(ch_nonsilent == 1) %>%
filter(ch_cnv == 1) %>%
mutate(mutnum_nonsilent = case_when(
mutnum_nonsilent < 5 ~ as.character(mutnum_nonsilent), T ~ '5+')
) %>%
count(mutnum_nonsilent)
p_n_ai = ggplot(
D,
aes(x = mutnum_nonsilent, y = n)
) +
theme_classic() +
geom_bar(stat = 'identity', position = 'stack') +
scale_fill_manual(values = cnv_pal) +
theme(
legend.position = 'none',
axis.title = element_text(size = 8)
) +
xlab('Events per patient') +
ylab('Number of patients')
# scale_y_continuous(expand = expansion(add = 0))
(p_n_all / p_n_ai) %>%
do_plot('sm_num.pdf', w = 2, h = 3.7)
options(repr.plot.width = 5, repr.plot.height = 2, repr.plot.res = 300)
D = M_wide %>%
mutate(age_bin = cut(age, c(0, seq(20, 80, 10), 100))) %>%
group_by(age_bin) %>%
summarise(
`GM VAF>2%` = sum(ch_nonsilent & VAF_nonsilent > 0.02)/n(),
`GM VAF>5%` = sum(ch_nonsilent & VAF_nonsilent > 0.05)/n(),
`GM VAF>10%` = sum(ch_nonsilent & VAF_nonsilent > 0.1)/n(),
n = n()
) %>%
melt(measure.vars = c('GM VAF>2%', 'GM VAF>5%', 'GM VAF>10%'), variable.name = 'threshold', value.name = 'prop') %>%
rbind(
M_wide %>%
mutate(age_bin = cut(age, c(0, seq(20, 80, 10), 100))) %>%
group_by(age_bin) %>%
summarise(
prop = sum(ch_cnv)/n(),
n = n()
) %>%
mutate(threshold = 'mCA')
)
p_age = ggplot(
D,
aes(x = age_bin, y = prop, group = threshold, color = threshold)
) +
geom_line() +
theme_classic() +
geom_point() +
ylab('Proportion of subjects') +
xlab('Age') +
labs(color = '')
p_age %>%
do_plot('sm_age.pdf', w = 5, h = 2)
tmns_sequenced = fread('/work/isabl/home/gaot/ch_cnv/data/sequenced_tmns_processed.tsv', sep = '\t')
tmns_sequenced = tmns_sequenced %>% mutate(Gender = substr(Gender, 1,1)) %>%
mutate(isabl_exp_id = `Experiment System ID`)
tmns_sequenced = tmns_sequenced %>% inner_join(M_wide %>% select(
dmp_patient_id, dmp_sample_id, valid_diagnosis, seq_to_diag, post_tmn, post_anyblood, heme_cat, ch_cnv, ch_nonsilent
), by = "dmp_patient_id")
tmns_sequenced = tmns_sequenced %>% filter(valid_diagnosis == 1)
tmns_sequenced = tmns_sequenced %>% filter(heme_cat %in% c('AML', 'MDS'))
tmns_sequenced %>% count(heme_cat)
tmns_sequenced %>% count(bait)
tmns_sequenced %>% count(ch_cnv)
tmns_sequenced %>% count(ch_nonsilent)
# tmns_sequenced %>% fwrite('./data/paired_tmn.tsv', sep = '\t')
segs_y_serial = fread('./data/y_segs_serial.tsv', sep = '\t')
segs_y_serial = segs_y_serial %>%
group_by(dmp_patient_id) %>%
mutate(timepoint = 1:n()) %>%
mutate(timepoint = as.character(timepoint)) %>%
ungroup()
options(repr.plot.width = 4, repr.plot.height = 2.5, repr.plot.res = 300)
segs_y_serial %>%
ggplot(
aes(x = timepoint, y = cnlr, group = dmp_patient_id)
) +
geom_line() +
geom_point(pch = 21, alpha = 0.4) +
theme_classic() +
scale_x_discrete(expand = c(0.1,0))
library(VennDiagram)
library(RColorBrewer)
futile.logger::flog.threshold(futile.logger::ERROR, name = "VennDiagramLogger")
venn.diagram(
x = list(M_wide[M_wide$ch_cnv == 1,][['dmp_patient_id']], M_wide[M_wide$ch_nonsilent == 1,][['dmp_patient_id']]),
category.names = c('mCA', 'GM'),
# output features
filename = './figures/venn.tiff',
imagetype="tiff",
height = 800,
width = 800,
# resolution = 300,
compression = "lzw",
output = TRUE,
margin = 0.1,
# circles
lwd = 1,
fill = c(cnv_col, sm_col),
# Numbers
cex = 0.6,
fontface = "bold",
fontfamily = "sans",
ext.dist = 0.06,
ext.line.lwd = 0.8,
# Set names
cat.cex = 0.55,
cat.fontface = "bold",
cat.default.pos = "outer",
cat.fontfamily = "sans",
cat.dist = 0.1
)
display(table(M_wide$ch_cnv, M_wide$ch_nonsilent))
fisher.test(table(M_wide$ch_cnv, M_wide$ch_nonsilent))
glm(
formula = ch_cnv ~ ch_nonsilent + age + age_sq,
data = M_wide %>% mutate(age_sq = age^2)
) %>%
sjPlot::get_model_data(type = 'est')
genes = M_long %>% count(Gene) %>% filter(n >= 30) %>% pull(Gene)
res = data.frame()
for (gene in genes) {
test = fisher.test(table(M_wide[[gene]], M_wide[['ch_cnv']]))
res = rbind(
res,
data.frame(
'p.value' = test$p.value,
'Gene' = gene,
'OR' = test$estimate
)
)
}
res = res %>% mutate(q = p.adjust(p.value, n = length(genes), method = 'BH'))
length(genes)
p_volcano = ggplot(
res %>% mutate(label = ifelse(q < 0.05, as.character(Gene), NA)),
aes(x = OR, y = -log(q), color = q < 0.05, label = label)
) +
geom_hline(yintercept = -log(0.05), linetype = 'dashed') +
geom_point() +
geom_text_repel(size = 2.5, seed = 3) +
theme_classic() +
scale_color_manual(values = c('gray', 'darkred')) +
theme(
legend.position = 'none'
)
n_cnv = sum(M_wide$ch_cnv)
n_normal = nrow(M_wide) - sum(M_wide$ch_cnv)
genes = M_long %>% count(Gene) %>% arrange(-n) %>% filter(n > 30) %>% pull(Gene)
D = M_long %>%
filter(ch_nonsilent == 1) %>%
count(Gene, ch_cnv) %>%
tidyr::complete(Gene, ch_cnv, fill = list(n = 0)) %>%
mutate(n_total = ifelse(ch_cnv == 1, n_cnv, n_normal)) %>%
group_by(ch_cnv) %>%
mutate(prop = n/n_total) %>%
left_join(
res,
by = 'Gene'
) %>%
filter(q < 0.05) %>%
arrange(desc(n)) %>%
mutate(Gene = factor(Gene, Gene))
p_genes = ggplot(
D %>% mutate(ch_cnv = ifelse(ch_cnv == 1, 'mCA+', 'mCA-')),
aes(x = Gene, y = prop, fill = ch_cnv)
) +
theme_classic() +
geom_col(
position = position_dodge(),
width = 0.75,
size = 0.2,
color = 'black'
) +
scale_y_continuous(expand = expansion(add = 0.001), labels = abs) +
scale_fill_manual(values = c(cnv_col, 'gainsboro')) +
theme(
axis.text.x = element_text(angle = 30, hjust = 1, size = 7),
legend.title = element_blank(),
legend.position = 'right'
) +
ylab('Proportion of patients') +
xlab('')
panel = (p_volcano | p_genes) + plot_layout(widths = c(1,3))
do_plot(panel, 'sm_enrichment.pdf', w = 7.5, h = 2.2)
D = M_wide %>%
mutate(mutnum_bin = ifelse(mutnum_nonsilent >= 4, '4+', as.character(mutnum_nonsilent))) %>%
group_by(mutnum_bin) %>%
summarise(
n_cnv = sum(ch_cnv),
total = n(),
prop = n_cnv/total,
`.groups` = 'drop'
) %>%
mutate(
prop_lower = qbinom(p = 0.05, size = total, prob = prop)/total,
prop_upper = qbinom(p = 0.95, size = total, prob = prop)/total,
)
p_mutnum = ggplot(
D,
aes(x = mutnum_bin, y = prop, group = '', ymin = prop_lower, ymax = prop_upper)
) +
geom_ribbon(fill = 'royalblue', alpha = 0.3) +
geom_line(color = 'black') +
geom_point(color = 'black') +
theme_classic() +
ylab('Prop. mCA') +
xlab('Number of mutations') +
scale_y_continuous(expand = expansion(mult = 0.05), limits = c(0,NA))
display(D)
D = M_wide %>%
mutate(VAF_bin = case_when(
VAF_trans == 0 ~ '<2%',
VAF_trans < 0.05 ~ '2-5%',
VAF_trans < 0.1 ~ '5-10%',
VAF_trans < 0.2 ~ '10-20%',
T ~ '>20%'
)) %>%
arrange(VAF_trans) %>%
mutate(VAF_bin = factor(VAF_bin, unique(VAF_bin))) %>%
group_by(VAF_bin) %>%
summarise(
n_cnv = sum(ch_cnv),
total = n(),
prop = n_cnv/total,
`.groups` = 'drop'
) %>%
mutate(
prop_lower = qbinom(p = 0.05, size = total, prob = prop)/total,
prop_upper = qbinom(p = 0.95, size = total, prob = prop)/total,
)
display(D)
p_vaf = ggplot(
D,
aes(x = VAF_bin, y = prop, ymin = prop_lower, ymax = prop_upper, group = '')
) +
geom_ribbon(fill = 'royalblue', alpha = 0.3) +
geom_line(color = 'black') +
geom_point(color = 'black') +
theme_classic() +
ylab('Prop. mCA') +
xlab('Max VAF') +
scale_y_continuous(expand = expansion(mult = 0.05), limits = c(0,NA)) +
theme(axis.text.x = element_text(angle = 30, hjust = 1))
panel = p_mutnum | p_vaf
do_plot(panel, 'mutnum_vaf.pdf', w = 4, h = 2)
D = M_wide %>%
mutate(mutnum_bin = ifelse(mutnum_nonsilent >= 3, '3+', as.character(mutnum_nonsilent))) %>%
group_by(mutnum_bin) %>%
summarise(
n_cnv = sum(ch_cnv),
total = n(),
prop = n_cnv/total,
`.groups` = 'drop'
) %>%
mutate(
prop_lower = qbinom(p = 0.05, size = total, prob = prop)/total,
prop_upper = qbinom(p = 0.95, size = total, prob = prop)/total,
)
D
sum(M_wide$ch_cnv)/nrow(M_wide)
M_wide %>%
mutate(mutnum_bin = ifelse(mutnum_trans >= 4, '4+', as.character(mutnum_trans))) %>%
filter(smoke != 'missing') %>%
filter(mutnum_bin != '0') %>%
glm(
ch_cnv_auto ~ age10 + mutnum_trans + VAF_trans,
data = .,
family = 'binomial'
) %>%
sjPlot::get_model_data(type = 'est') %>%
mutate(feature = 'mutnum')
options(repr.plot.width = 6, repr.plot.height = 1.5, repr.plot.res = 300)
M_wide %>%
mutate_at(
c('ch_my_pd', 'ch_cnv', 'ch_all'),
as.character
) %>%
mutate(ch_status = case_when(
ch_my_pd == 1 ~ 'ch_my_pd',
ch_all == 1 ~ 'ch_other',
T ~ 'none'
)) %>%
mutate(ch_status = factor(ch_status, rev(unique(ch_status)))) %>%
count(ch_status, ch_cnv) %>%
group_by(ch_cnv) %>%
mutate(prop = n/sum(n)) %>%
ungroup() %>%
ggplot(
aes(x = ch_cnv, y = prop, fill = ch_status, label = n)
) +
geom_col() +
theme_classic() +
coord_flip() +
geom_text(position = position_stack(vjust = 0.5), color = 'white') +
scale_y_continuous(expand = c(0,0))
# use stacked bar instead
options(repr.plot.width = 7, repr.plot.height = 3, repr.plot.res = 300)
germline_cause = c('11=', '1=', '15+', '23-', '14=', '16=', '8=', '10-', '15-', '15=')
# https://www.nejm.org/doi/full/10.1056/NEJMoa1516192
aml_cnvs = c('7-', '8+', '5-', '17-', '9-', '12-', '21+')
# https://www.nature.com/articles/nature14666
cll_cnvs = c('12+', '13-', '11-', '17-', '6-')
# elsa's data
mds_cnvs = c('5-', '8+', '7-', '20-', '11-', '17-', '17=', '4=', '13-', '21+')
#
mpn_cnvs = c('9=', '20-', '1=', '17=', '7=', '14=')
D = segs_filtered %>%
mutate_at(
c('ch_sm'),
as.character
) %>%
mutate(mutnum = ifelse(mutnum_nonsilent > 2, '3+', as.character(mutnum_nonsilent))) %>%
mutate(germline_cause = cnv_label %in% germline_cause) %>%
group_by(cnv_label) %>%
mutate(
prop_sm = sum(ch_nonsilent)/n(),
mutnum_median = mean(mutnum_nonsilent),
total = n()
) %>%
filter(total >= 4) %>%
ungroup() %>%
arrange(mutnum_median) %>%
mutate(cnv_label = factor(cnv_label, unique(cnv_label))) %>%
count(mutnum, cnv_label) %>%
group_by(cnv_label) %>%
mutate(
total = sum(n),
prop = n/total
) %>%
ungroup() %>%
arrange(cnv_label) %>%
mutate(
category = case_when(
cnv_label %in% germline_cause ~ 'germline',
cnv_label %in% c(aml_cnvs, cll_cnvs) ~ 'CLL/AML',
T ~ 'none')
)
ggplot(
D,
aes(x = cnv_label, y = prop, fill = mutnum, label = total)
) +
theme_classic() +
geom_col(size = 0) +
scale_y_continuous(expand = c(0,0), limits = c(0, 1.05)) +
theme(
axis.text.x = element_text(
angle = 30, hjust = 0.5, size = 9,
# color = case_when(
# unique(D$cnv_label) %in% germline_cause ~ 'blue',
# unique(D$cnv_label) %in% c(aml_cnvs, cll_cnvs) ~ 'red',
# T ~ 'black'),
face = 'bold'),
legend.box = "horizontal",
legend.position = 'top'
) +
# guides(
# colour = guide_legend(override.aes = list(size = 1, fill = NA)),
# fill = guide_legend(override.aes = list(size = 2))
# ) +
# annotate('text', label = D$total, x = D$cnv_label, y = 1, vjust = -0.1, size = 2) +
xlab('') +
ylab('proportion') +
scale_fill_brewer(palette = 'Reds', name = '#GM')
# scale_color_manual(values = c('germline' = 'blue', 'CLL/AML' = 'red', 'none' = 'black'), name = 'etiology')
summ_nonsilent_imputed = M_wide %>%
filter(!therapy_detailed & smoke != 'missing') %>%
ungroup() %>%
glm(
ch_nonsilent ~ Gender + race_b + age10 + smoke + eqd_3_t + pct_cytotoxic_therapy,
data = .,
family = 'binomial'
) %>%
sjPlot::get_model_data(type = 'est')
summ_nonsilent_detailed = M_wide %>%
filter(therapy_detailed & smoke != 'missing') %>%
ungroup() %>%
glm(
ch_nonsilent ~ Gender + race_b + age10 + smoke + eqd_3_t + pct_cytotoxic_therapy,
data = .,
family = 'binomial'
) %>%
sjPlot::get_model_data(type = 'est')
class_c = fread('./data/clinicalreview/chemoclass_jan2019_revised.csv') %>% pull(drugclass_c) %>% unique
options(repr.plot.width = 4, repr.plot.height = 2.5, repr.plot.res = 500)
options(warn=-1)
summ_cnv = M_wide %>%
filter(smoke != 'missing') %>%
mutate(smoke = as.integer(smoke)) %>%
glm(
ch_cnv_auto ~ Gender + race_b + age10 + smoke,
data = .,
family = 'binomial'
) %>%
sjPlot::get_model_data(type = 'est')
summ_cnv_therapy = M_wide %>%
filter(therapy_detailed & smoke != 'missing') %>%
filter(DateofProcedure == DOP_kelly) %>%
group_by(tumor_type) %>% mutate(n_tumor_type = n()) %>% ungroup() %>%
mutate(tumor_type = ifelse(n_tumor_type < 100, 'Other', tumor_type)) %>%
mutate(
# eqd_3_t = as.character(eqd_3_t),
# pct_cytotoxic_therapy = as.character(pct_cytotoxic_therapy),
smoke = as.integer(smoke)
) %>%
glm(
ch_cnv_auto ~ Gender + race_b + age10 + smoke + XRT + ind_cytotoxic_therapy,
data = .,
family = 'binomial'
) %>%
sjPlot::get_model_data(type = 'est')
options(warn=0)
rbind(
summ_cnv %>% mutate(cohort = 'full'),
summ_cnv_therapy %>% mutate(cohort = 'therapy')
)
p_forest = rbind(
summ_cnv %>% mutate(cohort = paste0('Full (n=', nrow(M_wide[M_wide$smoke != 'missing',]), ')')),
summ_cnv_therapy %>% mutate(cohort = paste0('Therapy (n=', sum(M_wide[M_wide$smoke != 'missing',]$therapy_detailed), ')'))
) %>%
arrange(cohort) %>%
mutate(cohort = factor(cohort, rev(unique(cohort)))) %>%
filter(!str_detect(term, 'tumor_type')) %>%
mutate(term = factor(term, c('age10', 'GenderMale', 'race_b', 'smoke', 'XRT', 'ind_cytotoxic_therapy'))) %>%
arrange(term) %>%
mutate(
term = c('age10' = 'Age', 'GenderMale' = 'Male Gender', 'race_b' = 'White Race',
'smoke' = 'Ever Smoker', 'XRT' = 'External Beam Radiation', 'ind_cytotoxic_therapy' = 'Cytotoxic Therapy')[term]
) %>%
mutate(term = factor(term, rev(unique(term)))) %>%
plot_forest(
x = 'term',
label = 'p.label',
eb_w = 0,
eb_s = 1,
ps = 2,
or_s = 3,
nudge = -0.4,
OR = T,
col = 'cohort',
dodge_width = 0.7
) +
theme_classic() +
ylab('OR') +
xlab('') +
scale_color_discrete(name = 'Cohort')
do_plot(p_forest, 'forest.pdf', w = 6, h = 4)
top_genes = c('DNMT3A', 'TET2', 'ASXL1', 'EZH2', 'TP53', 'ATM', 'CHEK2', 'PPM1D',
'SRSF2', 'SF3B1', 'U2AF1', 'MPL', 'JAK2', 'TERT')
global_effects = data.frame()
for (gene in top_genes) {
D = M_wide %>%
filter(!is.na(get(gene))) %>%
mutate(multi_cnv_chrom = as.integer(n_cnv_chrom > 1)) %>%
filter(ch_cnv_auto == 1)
n_co = sum(D[[paste0(gene, '')]] == 1 & D[['multi_cnv_chrom']] == 1)
if (n_co >= 2) {
pval = fisher.test(table(D[[paste0(gene, '')]], D[['multi_cnv_chrom']]))$p.value
global_effects = rbind(
global_effects,
data.frame(
'Gene' = gene,
'intersect_count' = n_co,
'pval' = pval
)
)
}
}
global_effects = global_effects %>% mutate(q = p.adjust(pval, method = 'BH', n = length(top_genes)))
global_effects
source('/work/isabl/home/gaot/facets-ch/R/facets-ch-utils.R')
plots = list()
complex_samples = M_wide %>% filter(n_cnv_chrom >= 2 & TP53) %>% arrange(VAF_TP53) %>% pull(dmp_sample_id)
for (sid in (complex_samples)) {
outdir = '/work/isabl/home/gaot/ch_cnv/results'
seg = fread(glue('{outdir}/{sid}_seg.tsv'))
sig = fread(glue('{outdir}/{sid}_sig.tsv'))
plots[[sid]] = facets_plot(sig, seg, title = sid, snp_distance = F)
}
panel = wrap_plots(plots, ncol = 1)
options(warn = -1)
do_plot(panel, 'complex.pdf', w = 10, h = 8)
options(warn = 0)
cis_genes = c('DNMT3A', 'TET2', 'ASXL1', 'EZH2', 'TP53', 'ATM', 'CHEK2', 'PPM1D',
'SRSF2', 'SF3B1', 'U2AF1', 'MPL', 'JAK2', 'TERT')
cis_gene_bed = gene_bed %>% filter(Gene %in% cis_genes)
overlap_gene = function(seg_chrom, seg_start, seg_end, window = 5e6) {
res = cis_gene_bed %>%
mutate(overlap = chrom == seg_chrom & start > seg_start - window & end < seg_end + window) %>%
pull(overlap) %>% as.integer()
return(res)
}
cnv_overlaps = mapply(
overlap_gene,
segs_filtered$chrom, segs_filtered$start, segs_filtered$end
) %>%
t %>%
as.data.frame() %>%
setNames(paste0(cis_gene_bed$Gene, '_cnv'))
display('overlapped gene with CNV')
segs_filtered = cbind(
segs_filtered %>%
select_if((!colnames(.) %in% colnames(cnv_overlaps)) | colnames(.) == 'dmp_patient_id'),
cnv_overlaps)
segs_filtered = segs_filtered %>%
mutate_at(colnames(cnv_overlaps), list(amp = function(x){x * (.[['type']] == 'amp')})) %>%
mutate_at(colnames(cnv_overlaps), list(deloh = function(x){x * (.[['type']] %in% c('del', 'loh'))})) %>%
mutate_at(colnames(cnv_overlaps), list(del = function(x){x * (.[['type']] == 'del')})) %>%
mutate_at(colnames(cnv_overlaps), list(loh = function(x){x * (.[['type']] == 'loh')}))
GENE_CNV = segs_filtered %>%
group_by(dmp_patient_id) %>%
summarise_at(
str_subset(colnames(.), '_cnv'),
max
)
M_wide = M_wide %>%
select_if((!colnames(.) %in% colnames(GENE_CNV)) | colnames(.) == 'dmp_patient_id') %>%
left_join(
GENE_CNV,
by = "dmp_patient_id",
fill = 0
) %>%
mutate_at(colnames(GENE_CNV), function(x){ifelse(is.na(x), 0, x)})
display('merging with main dataframe')
GENEALL = M_long %>%
reshape2::dcast(
formula = dmp_patient_id ~ Gene,
value.var = 'Gene',
fun.aggregate = function(x) {min(1, length(x))}
) %>%
setNames(c('dmp_patient_id', paste0(colnames(.)[-1], '_all')))
M_wide = M_wide %>%
select_if((!colnames(.) %in% colnames(GENEALL)) | colnames(.) == 'dmp_patient_id') %>%
left_join(GENEALL, by = "dmp_patient_id") %>%
mutate_at(colnames(GENEALL), function(x){ifelse(is.na(x), 0, x)})
segs_filtered = segs_filtered %>%
select_if((!colnames(.) %in% colnames(GENEALL)) | colnames(.) == 'dmp_patient_id') %>%
left_join(GENEALL, by = "dmp_patient_id") %>%
mutate_at(colnames(GENEALL), function(x){ifelse(is.na(x), 0, x)})
cis_effects = data.frame()
for (gene in cis_genes) {
D = M_wide %>% filter(!is.na(get(gene)))
sm_count = sum(D[[paste0(gene, '')]])
cnv_count = sum(D[[paste0(gene, '_cnv')]])
del_count = sum(D[[paste0(gene, '_del')]])
loh_count = sum(D[[paste0(gene, '_loh')]])
intersect_cnv = sum(D[[paste0(gene, '')]] == 1 & D[[paste0(gene, '_cnv')]] == 1)
intersect_del = sum(D[[paste0(gene, '')]] == 1 & D[[paste0(gene, '_cnv_del')]] == 1)
intersect_loh = sum(D[[paste0(gene, '')]] == 1 & D[[paste0(gene, '_cnv_loh')]] == 1)
# pval_all = fisher.test(table(D[[paste0(gene, '')]], D[[paste0(gene, '_cnv')]]))$p.value
if (intersect_del >= 2) {
pval = fisher.test(table(D[[paste0(gene, '')]], D[[paste0(gene, '_cnv_del')]]))$p.value
cis_effects = rbind(
cis_effects,
data.frame(
'Gene' = gene,
'cnv_type' = 'del',
'sm_count' = sm_count,
'cnv_count' = del_count,
'intersect_count' = intersect_del,
'pval' = pval
)
)
}
if (intersect_loh >= 2) {
pval = fisher.test(table(D[[paste0(gene, '')]], D[[paste0(gene, '_cnv_loh')]]))$p.value
cis_effects = rbind(
cis_effects,
data.frame(
'Gene' = gene,
'cnv_type' = 'loh',
'sm_count' = sm_count,
'cnv_count' = del_count,
'intersect_count' = intersect_loh,
'pval' = pval
)
)
}
}
cis_effects = cis_effects %>% mutate(q = p.adjust(pval, method = 'BH', n = length(cis_genes) * 2))
cis_effects = cis_effects %>%
mutate(Gene = as.character(Gene)) %>%
left_join(gene_bed, by = 'Gene') %>%
arrange(chrom) %>% mutate(Gene = factor(Gene, unique(Gene)))
cis_effects
gene_cnvs = function(gene, qval) {
all_colors = c(
'amp' = 'darkblue', 'loh' = 'darkgreen', 'del' = 'darkred',
'Regulatory' = 'darkturquoise', 'Missense' = 'salmon', 'Truncating' = 'purple')
gene_chrom = gene_bed %>% filter(Gene == gene) %>% pull(chrom)
gene_structure = gene_structures %>% filter(str_detect(description, gene))
gene_locs = gene_bed %>%
filter(Gene == gene) %>%
as.data.frame(stringsAsFactors = F) %>%
mutate(loc = as.integer((start + end) / 2), chrom = as.integer(chrom))
this_chrom_lens = chrom_lens %>% filter(chrom == gene_chrom)
acen_chrom = acen %>% filter(chrom == gene_chrom)
D = segs_filtered %>%
filter(chrom == gene_chrom) %>%
mutate(chrom = factor(chrom)) %>%
group_by(dmp_sample_id, chrom) %>%
mutate(total_length = sum(size)) %>%
mutate(type = factor(type, c('amp', 'loh', 'del'))) %>%
arrange(desc(type), round(start/1e7), -total_length) %>%
group_by(chrom) %>%
mutate(index = as.integer(factor(dmp_sample_id, unique(dmp_sample_id)))) %>%
ungroup()
M_overlap = M_long %>%
filter(Gene == gene) %>%
filter(dmp_patient_id %in% D$dmp_patient_id) %>%
rowwise() %>%
filter(
any(chrom == D$chrom & dmp_patient_id == D$dmp_patient_id)
# start >= D$start - window & start <= D$end + window)
) %>%
left_join(
D %>% select(dmp_sample_id, index),
by = 'dmp_sample_id'
) %>%
ungroup() %>%
mutate(VariantClass = case_when(
VariantClass %in% c('Silent', 'Intron') ~ 'noncoding',
T ~ 'coding'
))
p_segs = ggplot(
D,
aes(x = start, xend = end, y = index, yend = index, color = type)
) +
geom_rect(
inherit.aes = F,
data = acen_chrom,
aes(xmin = cen_start, xmax = cen_end, ymin = -Inf, ymax = Inf),
fill = 'gray',
size = 0,
alpha = 0.3
) +
geom_segment(
size = 1,
alpha = 0.8,
lineend = 'round'
) +
geom_segment(
inherit.aes = F,
data = this_chrom_lens,
aes(x = 0, xend = length, y = 0, yend = 0),
color = 'black',
size = 0
) +
# label the mutations
# geom_point(
# inherit.aes = F,
# mapping = aes(x = start, y = index, color = VariantClass2),
# data = M_overlap,
# size = 6.5,
# pch = '*',
# color = 'white'
# ) +
geom_point(
inherit.aes = F,
mapping = aes(x = start, y = index, fill = VariantClass2),
data = M_overlap,
size = 1.5,
color = 'black',
pch = 21
) +
# geom_text_repel(
# inherit.aes = F,
# mapping = aes(x = start, y = index, color = VariantClass2, label = AAchange),
# data = M_overlap %>% mutate(zoom = TRUE),
# size = 1.5,
# segment.size = 0.2,
# color = 'black',
# ylim = c(0,NA)
# ) +
scale_alpha(
range = c(0, 1)
) +
theme_classic() +
theme(
axis.text = element_blank(),
axis.ticks = element_blank(),
axis.title = element_blank(),
legend.position = 'none',
panel.spacing.y = unit(1, "mm"),
plot.margin = unit(c(0.5,0.2,0.2,0.2), "lines"),
strip.text.y = element_blank(),
panel.border = element_rect(size = 0.2, fill = NA, color = 'gray'),
strip.background = element_rect(size = 0.5, fill = 'seashell3', color = 'seashell3'),
plot.title = element_text(size = 7)
) +
scale_y_discrete(expand = expansion(add = 1)) +
xlab('') +
ylab('') +
scale_color_manual(values = all_colors) +
scale_fill_manual(values = all_colors) +
ggtitle(paste0(gene, ', chr', gene_chrom)) +
# exon structures
geom_segment(
inherit.aes = F,
data = gene_structure %>% mutate(zoom = TRUE),
aes(x = min(start), xend = max(end), y = -1, yend = -1),
size = 0.1,
color = 'black'
) +
geom_segment(
inherit.aes = F,
data = gene_structure %>% mutate(zoom = TRUE),
aes(x = start, xend = end, y = -1, yend = -1),
size = 2,
color = 'black'
) +
facet_zoom(
zoom.size = 1,
zoom.data = zoom,
xlim = c(gene_locs$start, gene_locs$end)
)
p_segs
}
plots = list()
for (i in 1:nrow(cis_effects)) {
gene = as.character(cis_effects[i,'Gene'])
qval = cis_effects[i,'q']
pval = cis_effects[i,'pval']
plots[[gene]] = gene_cnvs(gene, pval)
}
p_cis = plot_grid(plotlist = plots, nrow = 2)
do_plot(p_cis, 'cis_loci.pdf', w = 6, h = 5)
cis_total = sum(cis_effects$intersect_count)
cis_total %>% paste0(' total cis events')
(cis_total/nrow(segs_filtered_auto)) %>% signif(3) %>% {.*100} %>% paste0('% of total events')
window = 5e6
segs_filtered_auto = segs_filtered %>% filter(chrom != 23)
M_overlap = M_long %>%
filter(dmp_patient_id %in% segs_filtered_auto$dmp_patient_id) %>%
rowwise() %>%
mutate(
locality = ifelse(
any(chrom == segs_filtered_auto$chrom & dmp_patient_id == segs_filtered_auto$dmp_patient_id &
start >= segs_filtered_auto$start - window & start <= segs_filtered_auto$end + window
),
'cis',
'trans'
)
) %>%
ungroup() %>%
mutate(variant_type = case_when(
VariantClass %in% c('Silent', 'Intron') ~ 'noncoding',
T ~ 'coding'
))
M_overlap_nonsilent = M_overlap %>% filter(variant_type == 'coding')
segs_filtered_auto %>%
rowwise() %>%
mutate(
cis_coding = as.integer(
any(chrom == M_overlap_nonsilent$chrom & dmp_patient_id == M_overlap_nonsilent$dmp_patient_id &
M_overlap_nonsilent$start >= start - window & M_overlap_nonsilent$start <= end + window
))
) %>%
group_by(type) %>%
summarise(
n = n(),
prop_cis = sum(cis_coding)/n
)
export CNV cases without GM for review
genes = c('ATM', 'TET2', 'EZH2', 'DNMT3A', 'JAK2', 'MPL', 'TP53')
M_wide %>%
filter(
(ATM_cnv | TET2_cnv | EZH2_cnv | DNMT3A_cnv | JAK2_cnv | MPL_cnv | TP53_cnv)
) %>%
setDT() %>%
data.table::melt(
measure.vars = list(
'CNV' = paste0(genes, '_cnv'),
'GM' = genes
),
variable.name = 'Gene',
variable.factor = FALSE
) %>%
mutate(
Gene = setNames(genes, 1:length(genes))[Gene]
) %>%
filter(CNV == 1 & GM != 1) %>%
select(dmp_sample_id, Gene, CNV, GM) #%>%
#fwrite('./data/sm_review_jun10.tsv', sep = '\t')
window = 1e7
M_overlap = M_long %>%
filter(dmp_patient_id %in% segs_filtered$dmp_patient_id) %>%
rowwise() %>%
filter(
any(chrom == segs_filtered$chrom & dmp_patient_id == segs_filtered$dmp_patient_id &
start >= segs_filtered$start - window & start <= segs_filtered$end + window)
) %>%
ungroup() %>%
mutate(VariantClass = case_when(
VariantClass %in% c('Silent', 'Intron') ~ 'noncoding',
str_detect(VariantClass, 'Del|Ins') ~ 'indel',
T ~ 'snv'
)) %>%
filter(VariantClass != 'noncoding') %>%
filter(Gene %in% c("EZH2", "ATM", "TET2", "JAK2", "MPL", 'DNMT3A', 'TP53')) %>%
arrange(desc(VAF_N)) %>%
distinct(dmp_patient_id, `.keep_all` = T)
p_vaf = segs_filtered %>%
left_join(
M_overlap %>% select(dmp_patient_id, chrom, sm_pos = start, Gene, VariantClass, VAF_N),
by = c('chrom', 'dmp_patient_id')
) %>%
filter(type != 'amp') %>%
filter(VAF_N > 0) %>%
mutate(
phi_sm = case_when(
type == 'del' ~ 2 * VAF_N/(1 + VAF_N),
type == 'loh' ~ VAF_N
)
) %>%
mutate(pigeonhole = phi_sm + phi > 1) %>%
ggplot(
aes(x = VAF_N, y = phi, color = type2, label = Gene)
) +
# stat_function(fun = function(v){2*v}, linetype = 'dashed', color = cnv_pal['err']) +
# stat_function(fun = function(v){2 * v/(1+v)}, linetype = 'dashed', color = cnv_pal['del'], alpha = 0.5) +
# geom_abline(linetype = 'dashed', color = cnv_pal['loh'], alpha = 0.5) +
stat_function(fun = function(v){2 * v}, linetype = 'dashed', color = cnv_pal['err']) +
stat_function(fun = function(v){2 * v/(1+v)}, linetype = 'dashed', color = cnv_pal['del'], alpha = 0.5) +
geom_abline(linetype = 'dashed', color = cnv_pal['loh'], alpha = 0.5) +
# annotate('text', x = 0.35, y = 1, label = 'trans', color = cnv_pal['err'], size = 3) +
# annotate('text', x = 0.75, y = 1, label = 'DEL', color = cnv_pal['del'], size = 3, alpha = 0.5) +
# annotate('text', x = 0.9, y = 0.7, label = 'CNLOH', color = cnv_pal['loh'], size = 3, alpha = 0.5) +
theme_classic() +
# stat_cor(label.y = 0.95, size = 3) +
scale_color_manual(values = cnv_pal2) +
geom_point(
aes(pch = pigeonhole),
# pch = 21
) +
scale_shape_manual(values = c('TRUE' = 19, 'FALSE' = 21)) +
xlim(0,1) +
ylim(0,1) +
geom_text(size = 1, vjust = -1, hjust = -0.2) +
facet_wrap(~type2, nrow = 1, scale = 'free') +
theme(
legend.position = 'none',
strip.background = element_blank()
) +
xlab('Gene mutation VAF') +
ylab('mCA cell fraction')
do_plot(p_vaf, 'cis_vaf.pdf', w = 4.6, h = 2.5)
double_hits_cyto = M_wide %>% filter(
(MPL & chr1 & VAF_MPL < 0.35) |
(DNMT3A & chr2 & VAF_DNMT3A < 0.35) |
(TET2 & chr4 & VAF_TET2 < 0.35) |
(EZH2 & chr7 & VAF_EZH2 < 0.35) |
(JAK2 & chr9 & VAF_JAK2 < 0.35) |
(ATM & chr11 & VAF_ATM < 0.35) |
(TP53 & chr17 & VAF_TP53 < 0.35)
) %>%
mutate(hits = case_when(
MPL & chr1 ~ 'MPL',
DNMT3A & chr2 ~ 'DNMT3A',
TET2 & chr4 ~ 'TET2',
EZH2 & chr7 ~ 'EZH2',
JAK2 & chr9 ~ 'JAK2',
ATM & chr11 ~ 'ATM',
TP53 & chr17 ~ 'TP53'
)) %>%
select(dmp_patient_id, dmp_sample_id, hits) %>%
arrange(hits)
# double_hits_cyto %>% fwrite('./data/double_hits_cyto.tsv', sep = '\t')
options(repr.plot.width = 6, repr.plot.height = 2.3, repr.plot.res = 300)
window = 1e7
M_overlap = M_long %>%
filter(dmp_patient_id %in% segs_filtered$dmp_patient_id) %>%
rowwise() %>%
filter(
!any(chrom == segs_filtered$chrom & dmp_patient_id == segs_filtered$dmp_patient_id &
start >= segs_filtered$start - window & start <= segs_filtered$end + window)
) %>%
ungroup() %>%
mutate(VariantClass = case_when(
VariantClass %in% c('Silent', 'Intron') ~ 'noncoding',
str_detect(VariantClass, 'Del|Ins') ~ 'indel',
T ~ 'snv'
)) %>%
filter(VariantClass != 'noncoding') %>%
arrange(desc(VAF_N)) %>%
distinct(dmp_patient_id, `.keep_all` = T)
p_vaf = segs_filtered %>%
arrange(desc(phi)) %>%
distinct(dmp_patient_id, `.keep_all` = T) %>%
left_join(
M_overlap %>% select(dmp_patient_id, chrom, sm_pos = start, Gene, VariantClass, VAF_N),
by = c('dmp_patient_id')
) %>%
mutate(pigeonhole = 2 * VAF_N + phi > 1) %>%
ggplot(
aes(x = VAF_N, y = phi, color = type2, label = Gene)
) +
stat_function(fun = function(v){2*v}, linetype = 'dashed', color = cnv_pal['err']) +
stat_function(fun = function(v){2 * v/(1+v)}, linetype = 'dashed', color = cnv_pal['del'], alpha = 0.5) +
geom_abline(linetype = 'dashed', color = cnv_pal['loh'], alpha = 0.5) +
# annotate('text', x = 0.35, y = 1, label = 'trans', color = cnv_pal['err'], size = 3) +
# annotate('text', x = 0.75, y = 1, label = 'cis-del', color = cnv_pal['del'], size = 3, alpha = 0.5) +
# annotate('text', x = 0.9, y = 0.7, label = 'cis-loh', color = cnv_pal['loh'], size = 3, alpha = 0.5) +
theme_classic() +
# geom_abline(linetype = 'dashed') +
scale_shape_manual(values = c('TRUE' = 19, 'FALSE' = 21)) +
# stat_cor(label.y = 0.98, size = 3) +
scale_color_manual(values = cnv_pal2) +
geom_point(
# pch = 21,
aes(pch = pigeonhole)
) +
xlim(0,1) +
ylim(0,1) +
# geom_text(size = 1, vjust = -1, hjust = -0.2) +
# scale_shape_manual(values = c(1,5)) +
facet_wrap(~type2, nrow = 1, scale = 'free') +
theme(
legend.position = 'none',
strip.background = element_blank()
) +
ylab('mCA cell fraction') +
xlab('Gene mutation VAF')
do_plot(p_vaf, 'trans_vaf.pdf', w = 6.8, h = 2.5)
M_wide %>% filter(nonparta == 0 & chr7_del == 1 & ch_nonsilent) %>% filter(dmp_patient_id %in% segs_filtered_rerun)
sid = 'P-0014033-N01-IM5'
outdir = '/work/isabl/public/facets-ch/res'
seg = fread(glue('{outdir}/{sid}_seg.tsv'))
sig = fread(glue('{outdir}/{sid}_sig.tsv'))
source('/work/isabl/home/gaot/facets-ch/R/facets-ch-utils.R')
seg_prime = call_aberrant(sig, get_seg(sig), p_threshold = 5e-5)
segs_filtered_rerun2 = segs_filtered_rerun %>% filter((p_chisq < 1e-10 & p_y < 1e-3))
M_wide %>% filter(dmp_patient_id == 'P-0018453') %>% pull(heme_cat)
options(repr.plot.width = 10, repr.plot.height = 2.5, repr.plot.res = 300)
samples = segs_filtered %>% filter(dmp_patient_id == 'P-0018453')
for (i in 1:nrow(samples)) {
outdir = '/work/isabl/home/gaot/ch_cnv/results'
sid = samples[i,'dmp_sample_id']
plotfile = glue('{outdir}/{sid}.png')
seg = fread(glue('{outdir}/{sid}_seg.tsv'))
display(seg %>% filter(aberrant) %>% select(phi, err, type, p_chisq, start, end))
display_png(file = plotfile)
}
give.median <- function(x){
return(c(x = median(x), label = median(x)))
}
D = M_wide %>%
filter(anc < 30 & alc < 8 & amc < 3 & plt < 1000 | is.na(anc) | is.na(alc) | is.na(amc) | is.na(plt)) %>%
mutate(group = ifelse(ch_cnv == 1, 'mCA+', 'mCA-')) %>%
mutate(group = factor(group, c('mCA+', 'mCA-'))) %>%
group_by(group) %>%
mutate(n = n()) %>%
ungroup() %>%
mutate(group_label = paste0(group, '\n(n=', n, ')')) %>%
mutate(group_label = factor(group_label, unique(group_label))) %>%
reshape2::melt(measure.vars = cbcs[cbcs != 'rbc']) %>%
mutate(variable = str_wrap(cbc_labels[variable], width = 3))
p_cbc = ggplot(
D,
aes(x = value, fill = group_label)
) +
geom_histogram(bins = 20, show.legend = F) +
# stat_summary(aes(y = 0), fun.data = give.median, geom = "text", vjust = 0, size = 2) +
facet_grid(group_label~variable, scale = 'free') +
scale_y_continuous(expand = c(0.2,0)) +
scale_x_continuous(expand = c(0.1,0)) +
geom_text(
inherit.aes = F,
data = D %>% group_by(group_label, variable) %>% summarise(median = median(na.omit(value))),
mapping = aes(y = 0, x = median, label = median),
size = 3,
vjust = 1
) +
theme_bw() +
theme(
panel.spacing = unit(1, "mm"),
legend.position = 'top',
legend.title = element_blank(),
axis.text.x = element_text(size = 8, angle = 30, hjust = 1),
strip.text = element_text(size = 6),
strip.background = element_blank()
) +
xlab('') +
ylab('Number of subjects')
do_plot(p_cbc, 'cbc_dist.pdf', w = 8, h = 2.5)
M_wide = M_wide %>%
mutate(
thrombocytopenia = plt < 100,
anemia = hgb < 10,
neutropenia = anc < 1.8,
cytopenia = thrombocytopenia | anemia | neutropenia
)
D = M_wide %>%
mutate(group = factor(ifelse(ch_cnv == 1, 'mCA+', 'mCA-'), c('mCA-', 'mCA+'))) %>%
# mutate(group = factor(group, c('none', 'sm', 'cnv', 'both'))) %>%
select(group, thrombocytopenia, anemia, neutropenia, cytopenia) %>%
filter(complete.cases(.)) %>%
group_by(group) %>%
mutate(n = n()) %>%
ungroup() %>%
mutate(group_label = paste0(group, '(n=', n, ')')) %>%
mutate(group_label = factor(group_label, unique(group_label))) %>%
reshape2::melt(measure.vars = c('thrombocytopenia', 'anemia', 'neutropenia', 'cytopenia')) %>%
mutate(variable = ifelse(variable == 'cytopenia', 'any cytopenia', as.character(variable))) %>%
mutate(variable = Hmisc::capitalize(variable)) %>%
mutate(variable = factor(variable, unique(variable))) %>%
group_by(group_label, group, variable) %>%
summarise(
total = n(),
prop = sum(value)/total,
lower = qbinom(p = 0.05, size = total, prob = prop)/total,
upper = qbinom(p = 0.95, size = total, prob = prop)/total,
)
p_cytopenia = ggplot(
D,
aes(x = group, y = prop, ymin = lower, ymax = upper, color = group_label)
) +
geom_errorbar(position = position_dodge(width = 0.3), width = 0.2) +
geom_point(position = position_dodge(width = 0.3)) +
theme_classic() +
facet_wrap(~variable, scale = 'free_x', nrow = 1) +
ylab('Proportion of subjects') +
xlab('') +
theme(
legend.title = element_blank(),
strip.text = element_text(size = 7)
# legend.position = 'none'
)
do_plot(p_cytopenia, 'cytopenia.pdf', w = 6.5, h = 2.2)
unique(M_wide$ch_cnv_auto)
unique(M_wide$ch_nonsilent_auto)
#### R1 Blood counts mCA +/- GM violin plots MS edited
M_wide_Bcounts = M_wide %>% mutate (category = ifelse( ch_cnv_auto & ch_nonsilent_auto, "mCA + GM", "Rest" )) %>%
filter(anc < 30 & alc < 8 & amc < 3 & plt < 1000 | is.na(anc) | is.na(alc) | is.na(amc) | is.na(plt)) %>%
group_by(category) %>%
mutate(n = n()) %>%
ungroup() %>%
mutate(category_label = paste0(category, '\n(n=', n, ')')) %>%
mutate(category_label = factor(category_label, unique(category_label))) %>%
reshape2::melt(measure.vars = cbcs, variable.name = 'bc', value.name = 'count')
p = ggplot(M_wide_Bcounts, aes(x = category, y = count, fill = category_label)) + geom_violin(fill = "lightgrey") + facet_wrap(~bc, scale = "free", nrow = 1) +
geom_boxplot(width=0.1, outlier.alpha = 0)+ theme_classic() + labs(fill = "")
do_plot(p, "R1_bloodcounts_mCA_vs_rest_n.pdf", w = 15, h = 2)
#### R1 Blood counts trans violin plots MS edited
# segs_filtered_auto = segs_filtered_auto %>% mutate(
# trans_effect =
# (chrom == 3 & type == 'amp' & TP53) |
# (chrom == 4 & type == 'del' & (!TET2) & (SRSF2 | ATM | CHEK2 | ASXL1)) |
# (chrom == 5 & type == 'del' & TP53) |
# (chrom == 7 & type == 'del' & (TP53 | PPM1D)) |
# (chrom == 8 & type == 'amp' & TET2) |
# (chrom == 12 & type == 'amp' & (NOTCH1 | MYD88 | FBXW7)) |
# (chrom == 13 & type == 'del' & ATM) |
# (chrom == 20 & type == 'del' & (U2AF1 | TERT))
# )
M_wide_trans = M_wide %>% mutate(
trans_effect =
(chr3_amp & TP53) |
(chr4_del & (!TET2) & (SRSF2 | ATM | CHEK2 | ASXL1)) |
(chr5_del & TP53) |
(chr7_del & (TP53 | PPM1D)) |
(chr8_amp & TET2) |
(chr12_amp & (NOTCH1 | MYD88 | FBXW7)) |
(chr13_del & ATM) |
(chr20_del & (U2AF1 | TERT))
) %>% mutate(
cis_effect =
(chr1_loh & MPL) |
(chr2_del & DNMT3A) |
((chr4_del |chr4_loh) & TET2) |
(chr7_loh & EZH2) |
(chr9_loh & JAK2) |
((chr11_loh | chr11_del) & ATM) |
(chr17 & TP53)
) %>% filter(anc < 30 & alc < 8 & amc < 3 & plt < 1000 | is.na(anc) | is.na(alc) | is.na(amc) | is.na(plt)) %>%
mutate(category = ifelse(trans_effect|cis_effect , "cis/trans", "Rest")) %>% group_by(category) %>%
mutate(n = n()) %>%
ungroup() %>%
mutate(category_label = paste0(category, '\n(n=', n, ')')) %>%
mutate(category_label = factor(category_label, unique(category_label))) %>%
reshape2::melt(measure.vars = cbcs, variable.name = 'bc', value.name = 'count')
# M_wide_trans %>% count(trans_effect | cis_effect)
p = ggplot(M_wide_trans, aes(x = category , y = count)) + geom_violin(fill = "lightgrey") + facet_wrap(~bc, scale = "free", nrow = 1) + geom_boxplot(width=0.1, outlier.alpha = 0)+ theme_classic()
do_plot(p, "R1_bloodcounts_cistrans_vs_rest.pdf", w = 15, h = 2)
p = ggplot(M_wide_trans, aes(x = category, y = count, fill = category_label)) + geom_violin(fill = "lightgrey") + facet_wrap(~bc, scale = "free", nrow = 1) +
geom_boxplot(width=0.1, outlier.alpha = 0)+ theme_classic() + labs(fill = "")
do_plot(p, "R1_bloodcounts_cistrans_vs_rest_n.pdf", w = 15, h = 2)
M_wide %>% count(heme_cat2)
M_wide %>% filter(alc>5 & heme_cat2 == "CLL") %>% count(ch_cnv)
segs_filtered %>% filter(alc>5 & heme_cat2 == "CLL") %>% count(chrom, dmp_sample_id, type)
#### R1 Blood counts heme category disease violin plots MS edited
M_wide_hemecat = M_wide %>% mutate( heme_cat3 = case_when(
heme_cat2 %in%c( "MDS", "MPN", "AML", "CML") ~ "Myeloid",
heme_cat2 %in% "CLL" ~ c("CLL"),
heme_cat2 %in% c("B-NHL", "T-NHL") ~ "Lymphoid" ,
T ~ "None"
)) %>% filter(anc < 30 & alc < 8 & amc < 3 & plt < 1000 | is.na(anc) | is.na(alc) | is.na(amc) | is.na(plt)) %>%
group_by(heme_cat3) %>%
mutate(n = n()) %>%
ungroup() %>%
mutate(category_label = paste0(heme_cat3, '\n(n=', n, ')')) %>%
mutate(category_label = factor(category_label, unique(category_label))) %>%
reshape2::melt(measure.vars = cbcs, variable.name = 'bc', value.name = 'count')
p = ggplot(M_wide_hemecat, aes(x = heme_cat3 , y = count)) + geom_violin(fill = "lightgrey") + facet_wrap(~bc, scale = "free", nrow = 1) +
geom_boxplot(width=0.1, outlier.alpha = 0)+ theme_classic()+ theme(axis.text.x = element_text(angle = 45, hjust=1))
do_plot(p, "R1_bloodcounts_heme_disease_1row.pdf", w = 20, h = 2)
p = ggplot(M_wide_hemecat, aes(x = heme_cat3, y = count, fill = category_label)) + geom_violin(fill = "lightgrey") + facet_wrap(~bc, scale = "free", nrow = 1) +
geom_boxplot(width=0.1, outlier.alpha = 0)+ theme_classic() + labs(fill = "") + theme(axis.text.x = element_text(angle = 45, hjust=1))
do_plot(p, "R1_bloodcounts_heme_disease_1row_n.pdf", w = 15, h = 2)
p = ggplot(M_wide_hemecat, aes(x = heme_cat3 , y = count)) + geom_violin(fill = "lightgrey") + facet_wrap(~bc, scale = "free", nrow = 2) +
geom_boxplot(width=0.1, outlier.alpha = 0)+ theme_classic()
do_plot(p, "R1_bloodcounts_heme_disease_2row.pdf", w = 15, h = 5)
colnames(M_wide)
M_wide %>% filter(ch_cnv == 1) %>% select(dmp_patient_id) # %>% fwrite('./data/cnv_cases.tsv', sep = '\t')
cbc_cnv = fread('./data/cbc_serial_cnv.tsv')
plot_cbc = function(cbc_serial) {
cbc_serial %>%
reshape2::melt(measure.vars = cbcs, variable.name = 'bc', value.name = 'count') %>%
mutate(
cytopenia = case_when(
bc == 'plt' ~ count < 100,
bc == 'hgb' ~ count < 10,
bc == 'anc' ~ count < 1.8,
T ~ F
)
) %>%
ggplot(
aes(x = DAYSFROMIMPACT, y = count, group = bc, color = cytopenia)
) +
geom_vline(xintercept = 0, linetype = 'dashed', color = 'gray') +
geom_line() +
geom_point(size = 1) +
theme_classic() +
facet_wrap(~bc, scale = 'free') +
theme(legend.position = 'none') +
scale_color_manual(values = c('royalblue', 'red'))
}
cbcs = c("anc", "alc", "amc", "hgb", "mcv", "rdw", "plt")
cnvs = colnames(CNV_detailed)[-1][(CNV_detailed[,-1] %>% colSums) >= 5]
cnvs = cnvs[cnvs != 'chr9_loh']
cnvs = c(cnvs, 'chr9_p_loh', 'chr9_q_loh')
cbcs = cbcs[cbcs != 'rbc']
summs = data.frame()
for (bc in paste0(cbcs[!cbcs %in% c('rbc', 'wbc')], '_n')) {
display(bc)
for (cnv in cnvs) {
summ = M_wide %>%
lm(
glue('{bc} ~ Gender + race_b + age10 + smoke + {cnv}'),
data = .
) %>%
sjPlot::get_model_data(type = 'est') %>%
mutate(bc = bc)
summs = rbind(summs, summ)
}
}
options(repr.plot.width = 6, repr.plot.height = 1.8, repr.plot.res = 300)
limit = 5
D = summs %>%
filter(term %in% cnvs) %>%
rowwise() %>%
mutate(estimate = max(min(estimate, limit), -limit)) %>%
ungroup() %>%
mutate(q = p.adjust(p.value, method = 'BH')) %>%
mutate(signif_cat = case_when(
q < 0.01 ~ 'q<0.01',
q < 0.05 ~ 'q<0.05',
q < 0.1 ~ 'q<0.1',
T ~ 'ns'
)) %>%
mutate(
bc = str_remove(bc, '_n'),
term = str_remove_all(term, '_|chr'),
term = str_replace(str_replace(str_replace(term, 'del', '-'), 'amp', '+'), 'loh', '='),
chrom = as.integer(str_extract(term, '\\d+'))
) %>%
arrange(chrom) %>%
mutate(term = factor(term, unique(term)))
ggplot(
D,
aes(x = term, y = bc, fill = estimate)
) +
theme_classic() +
geom_tile(width = 0.9, height = 0.9) +
geom_point(
data = D %>% filter(q < 0.1),
aes(pch = signif_cat)
# aes(size = -log10(p.value))
) +
scale_shape_manual(values = c('q<0.01' = 8, 'q<0.05' = 3, 'q<0.1' = 20), name = 'FDR') +
scale_fill_gradient2(low = 'lightblue', high = 'red', mid = 'white', na.value = 'white', midpoint = 0, limits = c(-limit, limit)) +
theme(plot.margin = unit(c(0,0,0,0), 'mm'), legend.key.size = unit(0.5, 'cm')) +
theme(
axis.text.x = element_text(angle = 30, hjust = 0.5, size = 8),
axis.text.y = element_text(size = 8),
legend.key.size = unit(2.5, 'mm'),
legend.text = element_text(size = 8),
legend.title = element_text(size = 8),
legend.spacing = unit(1, 'mm')
) +
xlab('') +
ylab('')
combs = list(c('JAK2', 'chr9_p_loh'), c('EZH2', 'chr7_loh'),
c('TET2', 'chr4_del', 'chr4_loh'),
c('MPL', 'chr1_p_loh'), c('ATM', 'chr11_del', 'chr11_loh'),
c('DNMT3A', 'chr2_del'),
c(''))
summs = data.frame()
for (bc in paste0(cbcs[!cbcs %in% c('rbc', 'wbc')], '_n')) {
display(bc)
for (comb in combs) {
comb = paste(comb, collapse = '+')
summ = M_wide %>%
lm(
glue('{bc} ~ Gender + race_b + age10 + smoke + {comb}'),
data = .
) %>%
sjPlot::get_model_data(type = 'est') %>%
mutate(bc = bc)
summs = rbind(summs, summ)
}
}
options(repr.plot.width = 6, repr.plot.height = 2, repr.plot.res = 300)
limit = 5
D = summs %>%
filter(!str_detect(term, 'Gender|race|smoke|age')) %>%
rowwise() %>%
mutate(estimate = max(min(estimate, limit), -limit)) %>%
ungroup() %>%
mutate(q = p.adjust(p.value, method = 'BH')) %>%
# mutate(q = p.value) %>%
mutate(signif_cat = case_when(
q < 0.01 ~ 'q<0.01',
q < 0.05 ~ 'q<0.05',
q < 0.1 ~ 'q<0.1',
T ~ 'ns'
)) %>%
mutate(term = factor(term, unlist(combs))) %>%
arrange(term) %>%
mutate(
bc = str_remove(bc, '_n'),
term = str_remove_all(term, '_|chr'),
term = str_replace(str_replace(str_replace(term, 'del', '-'), 'amp', '+'), 'loh', '='),
chrom = as.integer(str_extract(term, '\\d+'))
) %>%
mutate(term = factor(term, unique(term)))
ggplot(
D,
aes(x = term, y = bc, fill = estimate)
) +
theme_classic() +
geom_tile(width = 0.9, height = 0.9) +
geom_point(
data = D %>% filter(q < 0.1),
aes(pch = signif_cat)
# aes(size = -log10(p.value))
) +
scale_shape_manual(values = c('q<0.01' = 8, 'q<0.05' = 3, 'q<0.1' = 20), name = 'FDR') +
scale_fill_gradient2(low = 'lightblue', high = 'red', mid = 'white', na.value = 'white', midpoint = 0, limits = c(-limit, limit)) +
theme(plot.margin = unit(c(0,0,0,0), 'mm'), legend.key.size = unit(0.5, 'cm')) +
theme(
axis.text.x = element_text(angle = 30, hjust = 1, size = 8),
axis.text.y = element_text(size = 8),
legend.key.size = unit(2.5, 'mm'),
legend.text = element_text(size = 8),
legend.title = element_text(size = 8),
legend.spacing = unit(1, 'mm')
) +
xlab('') +
ylab('')
p = ggplot(
M_wide %>%
filter(!is.na(heme_cat2)) %>%
group_by(heme_cat2) %>%
mutate(n_heme_cat = n()) %>%
ungroup() %>%
mutate(heme_cat2 = glue('{heme_cat2}({n_heme_cat})')) %>%
mutate(seq_to_diag = seq_to_diag/365),
aes(x = seq_to_diag, fill = heme_cat2)
) +
geom_histogram(bins = 50, color = 'black', size = 0.1) +
scale_y_continuous(expand = c(0,0)) +
xlab('Years since blood draw') +
theme_classic() +
theme(
legend.title = element_blank(),
legend.position = 'top'
) +
ylab('Cases')
do_plot(p, 'diagnosis.pdf', w = 5, h = 2.5)
(M_wide %>% filter(post_heme == 1) %>% pull(seq_to_diag) %>% mean)/30
M_wide %>% filter(post_heme == 1) %>% pull(seq_to_diag) %>% range %>% {./30}
no_cnv = M_wide %>% filter(post_heme == 1 & heme_cat == 'MDS' & (!ch_cnv_auto)) %>% pull(seq_to_diag)
yes_cnv = M_wide %>% filter(post_heme == 1 & heme_cat == 'MDS' & ch_cnv_auto) %>% pull(seq_to_diag)
t.test(no_cnv, yes_cnv)
M_wide %>%
filter(post_heme == 1) %>%
mutate(
heme_cat = ifelse(heme_cat %in% c('ET', 'PV'), 'MPN', heme_cat),
heme_cat = ifelse(heme_cat %in% c('ALCL', 'CTCL'), 'T-NHL', heme_cat),
heme_cat = ifelse(heme_cat %in% c('DLBCL', 'FL', 'MZL', 'PCNSL', 'WM'), 'B-NHL', heme_cat)
) %>%
group_by(heme_cat) %>%
mutate(n_heme_cat = n()) %>%
ungroup() %>%
mutate(seq_to_diag = seq_to_diag/30) %>%
group_by(heme_cat) %>%
summarise(
n = n(),
seq_to_diag = mean(seq_to_diag)
)
M_wide %>%
filter(post_heme == 1) %>%
filter(heme_cat == 'MDS') %>%
group_by(ch_cnv_auto) %>%
summarise(round(median(seq_to_diag)/30, 0))
gene_set = fread('./data/gene_set.tsv')$Gene
gene_set = c('DNMT3A', 'TET2', 'JAK2', 'ASXL1', 'TP53', 'ATM', 'PPM1D', 'CHEK2',
'SRSF2', 'SF3B1', 'U2AF1', 'EZH2', 'MPL', 'NOTCH1', 'TERT', 'SUZ12', 'CBL')
patients_heme = M_wide %>% filter(post_heme == 1) %>% pull(dmp_patient_id)
segs_auto = segs_filtered %>% filter(chrom != '23')
M_co = rbind(
M_long %>%
filter(Gene %in% gene_set) %>%
filter(ch_nonsilent == 1 | VariantClass == 'Addition') %>%
mutate(
mutation = Gene,
type = VariantClass2,
arm = 'na',
focality = 'na'
) %>%
rowwise() %>% mutate(phi = min(VAF_N * 2, 1)) %>% ungroup() %>%
select(mutation, phi, type, arm, focality, dmp_patient_id),
segs_auto %>%
mutate(type_label = c('loh' = '=', 'del' = '-', 'amp' = '+')[type]) %>%
mutate(mutation = paste0(chrom, ifelse(arm == 'p,q', '', arm), type_label)) %>%
select(mutation, phi, type = type2, arm, focality, dmp_patient_id)
) %>%
filter(dmp_patient_id %in% patients_heme) %>%
full_join(
M_wide %>% filter(post_heme == 1) %>%
select(dmp_patient_id, heme_cat, seq_to_diag),
by = 'dmp_patient_id'
) %>%
filter(heme_cat != 'LCH') %>%
mutate(
heme_cat = ifelse(heme_cat %in% c('ET', 'PV'), 'MPN', heme_cat),
heme_cat = ifelse(heme_cat %in% c('ALCL', 'CTCL'), 'T-NHL', heme_cat),
heme_cat = ifelse(heme_cat %in% c('DLBCL', 'FL', 'MZL', 'PCNSL', 'WM'), 'B-NHL', heme_cat)
) %>%
mutate(heme_cat = factor(heme_cat, c('MDS', 'MPN', 'AML', 'CML', 'CLL', 'B-NHL', 'T-NHL', 'MM'))) %>%
mutate(mutation = ifelse(is.na(mutation), 'none', mutation))
M_co = M_co %>% mutate(
class = case_when(
mutation %in% c('JAK2', 'MPL', 'CHEK2', '9ploh') ~ 'MPN',
mutation %in% c('TP53') ~ 'TP53',
mutation %in% c('EZH2', 'SRSF2', 'U2AF1', 'SF3B1', 'ASXL1', 'SUZ12') ~ 'MDS',
mutation %in% c('PPM1D', '7qdel') ~ 'CMML',
mutation %in% c('NOTCH1', '12amp', '19amp', 'ATM', 'MYD88') ~ 'CLL',
T ~ 'other'
),
class_rank = as.integer(factor(class, c('MPN', 'TP53', 'MDS', 'CMML', 'CLL', 'other')))
)
mutation_order = c('DNMT3A', 'TET2', 'ASXL1', 'JAK2', 'MPL', 'CHEK2',
'SRSF2', 'SF3B1', 'U2AF1', 'EZH2', 'SUZ12', 'PPM1D', 'NOTCH1','ATM',
'MYD88', 'TERT', 'TP53')
mutation_other = M_co %>% filter(!mutation %in% mutation_order) %>% pull(mutation) %>% unique
mutation_order = c(mutation_order, mutation_other)
D = M_co %>%
# filter(type == 'snv') %>%
distinct(mutation, dmp_patient_id, `.keep_all` = T) %>%
group_by(mutation) %>% mutate(mutation_n = n()) %>% ungroup() %>%
arrange(class_rank, mutation != 'EZH2', desc(mutation_n)) %>%
mutate(mutation = factor(mutation, rev(unique(mutation)))) %>%
rowwise() %>%
mutate(mut_weight = 2^which(mutation == levels(mutation))) %>%
ungroup() %>%
group_by(dmp_patient_id) %>%
mutate(patient_weight = sum(mut_weight)) %>%
ungroup() %>%
group_by(dmp_patient_id) %>%
mutate(patient_group = min(class_rank)) %>%
ungroup() %>%
arrange(heme_cat, seq_to_diag, desc(patient_weight), type) %>%
mutate(pid = as.integer(factor(dmp_patient_id, unique(dmp_patient_id)))) %>%
mutate(mutation = as.character(mutation)) %>%
mutate(mutation = ifelse(mutation == 'none', NA, mutation)) %>%
mutate(mutation = factor(mutation, rev(mutation_order))) %>%
mutate(mutation_i = as.integer(mutation))
p_mat = ggplot(
D,
aes(x = pid, y = mutation, fill = type)
) +
geom_tile(color = 'white', size = 0.5) +
geom_vline(
aes(xintercept = pid - 0.5), color = 'gray95', size = 0.3
) +
# geom_hline(
# yintercept = length(levels(D$mutation)) - 1 - c(2.5), color = 'gray80',
# ) +
theme_classic() +
theme(
axis.text.x = element_blank(),
axis.title.x = element_blank(),
plot.margin = unit(c(0,0,0,0), 'mm'),
legend.key.size = unit(2, 'mm')
) +
geom_vline(xintercept = cumsum(
D %>% distinct(pid, heme_cat) %>% count(heme_cat) %>% pull(n)
) + 0.5, size = 0.2, linetype = 'dashed') +
scale_fill_manual(
values = c(cnv_pal2, sm_pal),
na.translate = FALSE
) +
scale_color_manual(
values = c(cnv_pal2, sm_pal),
na.translate = FALSE
) +
scale_x_discrete(expand = expansion(add = 0)) +
scale_y_discrete(expand = c(0.01,0), na.translate = FALSE) +
xlab('') +
scale_alpha_continuous(range = c(0.4,1)) +
labs(alpha = 'Cell fraction') +
labs(fill = 'Mutation') +
ylab('')
p_heme = D %>%
mutate(seq_to_diag = seq_to_diag/365) %>%
distinct(dmp_patient_id, pid, heme_cat, seq_to_diag) %>%
group_by(heme_cat) %>%
mutate(label_pos = median(pid)) %>%
ungroup %>%
ggplot(
aes(x = pid, y = 1, fill = heme_cat, alpha = seq_to_diag)
) +
geom_tile(color = 'white', size = 0.5, show.legend = T) +
# geom_vline(xintercept = cumsum(
# D %>% distinct(pid, heme_cat) %>% count(heme_cat) %>% pull(n)
# ) + 0.5, size = 0.2, linetype = 'dashed') +
geom_rect(xmin = 0, xmax = max(D$pid) + 0.5, ymin = 1.5, ymax = 3, fill = 'white', show.legend = F) +
geom_hline(yintercept = 0.5, size = 1) +
geom_text(
aes(x = label_pos, y = 2, label = heme_cat),
color = 'black',
vjust = 1,
size = 2,
show.legend = F
) +
theme_void() +
theme(
axis.text.x = element_blank(),
legend.key.size = unit(2, 'mm'),
plot.margin = unit(c(0,0,0,0), 'mm'),
legend.margin = margin(l = 1.7,1,1,1, "mm")
) +
scale_x_discrete(expand = expansion(add = 0)) +
scale_y_discrete(expand = expansion(add = 0)) +
ylab('') +
scale_fill_manual(
values = c(' ' = 'white', 'CLL' = 'royalblue', 'MDS' = 'tomato3', 'CML' = 'yellow',
'AML' = 'red', 'MPN' = 'purple', 'B-NHL' = 'yellowgreen', 'T-NHL' = 'orchid', 'MM' = 'orange')
) +
scale_alpha_continuous(trans = 'reverse') +
labs(alpha = "Years to \ndiagnosis") +
guides(fill = FALSE)
panel = (p_heme / p_mat) + plot_layout(heights = c(1,20), guides = 'collect')
do_plot(panel, 'heme_heatmap.pdf', w = 7.7, h = 5)
M_wide %>% filter(heme_cat2 == 'MPN' & chr9) %>% pull(seq_to_diag)
betas = data.frame()
for (landmark in seq(90, 360, 90)) {
D = M_wide %>%
filter(time_leukemia > landmark) %>%
mutate(time_leukemia = time_leukemia - landmark) %>%
rowwise() %>%
mutate(
# VAF_pd_10 = min(VAF_pd, 0.5)/0.1,
VAF_pd_10 = VAF_pd/0.1
) %>%
ungroup()
fit = coxph(Surv(time_leukemia, post_leukemia) ~ ch_cnv_auto + VAF_pd_10 + mutnum_pd + age, data = D, na.action = 'na.omit')
cox.sum = sjPlot::get_model_data(fit, type = 'est') %>% mutate(with_cnv = 0, landmark = landmark)
betas = rbind(
betas,
cox.sum
)
}
p = betas %>%
filter(!(term %in% c('GenderMale', 'age'))) %>%
mutate(landmark = paste0(landmark, ' days')) %>%
mutate(landmark = factor(landmark, unique(landmark))) %>%
mutate(term = c('VAF_pd_10' = 'PD VAF', 'mutnum_pd' = 'PD Mutation Number', 'ch_cnv_auto' = 'mCA')[as.character(term)]) %>%
mutate(term = factor(term, unique(term))) %>%
plot_forest(
x = 'term',
label = 'p.stars',
eb_w = 0,
eb_s = 1,
ps = 2,
or_s = 3,
nudge = 0.1,
col = 'landmark'
) +
theme_classic() +
ylab('HR') +
xlab('') +
scale_color_discrete(name = 'Landmark')
do_plot(p, 'landmark.pdf', w = 4.5, h = 3)
options(repr.plot.width = 4, repr.plot.height = 2, repr.plot.res = 300)
p = betas %>%
filter(landmark == 270) %>%
filter(!(term %in% c('GenderMale', 'age'))) %>%
mutate(landmark = paste0(landmark, ' days')) %>%
mutate(landmark = factor(landmark, unique(landmark))) %>%
mutate(term = c('VAF_pd_10' = 'PD VAF', 'mutnum_pd' = 'PD Mutation Number', 'ch_cnv_auto' = 'mCA')[as.character(term)]) %>%
mutate(term = factor(term, unique(term))) %>%
plot_forest(
x = 'term',
label = 'p.stars',
eb_w = 0,
eb_s = 1,
ps = 2,
or_s = 3,
nudge = 0.1
) +
theme_classic() +
ylab('HR') +
xlab('')
do_plot(p, 'cox.pdf', w = 4, h = 1.8)
betas %>%
filter(landmark == 270)
# data frame for Sean
M_wide %>%
select(dmp_patient_id, ch_cnv_auto, ch_nonsilent, DateofProcedure, date_death,
date_lastfu, post_leukemia, post_mn, seq_to_diag) %>%
#fwrite('./data/survival_sean.tsv', sep = '\t')
library(prodlim)
library(cmprsk)
data001 = M_wide %>% mutate(seq_to_diag = ifelse(post_leukemia, seq_to_diag, NA))
OSDays <- as.numeric(as.Date(data001$date_lastfu , "%Y-%m-%d") - as.Date(data001$DateofProcedure , "%Y-%m-%d"))
Dead <- 1*(!is.na(as.Date(data001$date_death , "%Y-%m-%d")))
TimetoAnalysis <- data001$seq_to_diag
TimetoAnalysis[is.na(TimetoAnalysis)] <- OSDays[is.na(TimetoAnalysis)]
YearstoAnalysis <- TimetoAnalysis/365
data001$YearstoAnalysis <- YearstoAnalysis
EventAnalysis <- 2*Dead
EventAnalysis[!is.na(data001$seq_to_diag)] <- 1
data001$EventAnalysis <- EventAnalysis
CH <- rep("None", length(data001$ch_nonsilent))
CH[data001$ch_pd ==0 & data001$ch_cnv_auto==1] <- "mCA ONLY"
CH[data001$ch_pd ==1 & data001$ch_cnv_auto==0] <- "GM ONLY"
CH[data001$ch_pd ==1 & data001$ch_cnv_auto==1] <- "mCA+GM"
data001$CH <- CH
data001 = data001 %>% mutate(CH = factor(CH, c('None', 'GM ONLY', 'mCA ONLY', 'mCA+GM')))
data002 <- data001[data001$YearstoAnalysis >0 , ] ## need to remove the <= 0 times
dim(data002)
cmv <- prodlim(Hist(YearstoAnalysis, EventAnalysis) ~CH, data=data002)
summary(cmv,3) ### if the lower confidence interval =0 in the output just put at <0.001
par(mar = c(3,2,1,1))
pdf(file = './figures/cum_inc.pdf', width = 6, height = 6)
plot(cmv,
xlim=c(0, 4), ylim=c(0,0.25),
xlab = "Years from blooddraw", ylab = "Cumulative Incidence",
atRisk.at = seq(0, 4, 1),
axis1.at = seq(0, 4, 1),
atrisk.labels = "",
legend.cex = 1,
col = c('#abdda4', '#2b83ba','#fdae61', '#d7191c'),
legend.x="topleft")
title(main="", cex=3)
dev.off()
display_pdf(file = './figures/cum_inc.pdf')
## P-value
print(cuminc(YearstoAnalysis, EventAnalysis, CH))
## P-value is <0.001 [corresponding test statistics is 174.7842, under 3 d.f.s
segs_filtered %>%
filter(post_anyblood == 1) %>%
mutate(heme_cat = ifelse(heme_cat %in% c('ET', 'PV'), 'MPN', heme_cat)) %>%
reshape2::dcast(heme_cat ~ chrom + type, fun.aggregate = function(x){length(unique(x))}, value.var = 'dmp_patient_id')
segs_filtered %>%
filter(post_anyblood == 1) %>%
mutate(heme_cat = ifelse(heme_cat %in% c('ET', 'PV'), 'MPN', heme_cat)) %>%
reshape2::dcast(heme_cat ~ ., fun.aggregate = function(x){length(unique(x))}, value.var = 'dmp_patient_id')
M_wide %>%
mutate(heme_cat = ifelse(heme_cat %in% c('ET', 'PV'), 'MPN', heme_cat)) %>%
filter(post_anyblood == 1) %>%
group_by(heme_cat) %>%
summarise(
total = n(),
n_ai = sum(ch_cnv_auto),
n_sm = sum(ch_nonsilent),
n_ai_only = sum(ch_cnv_auto & (!ch_nonsilent))
)
M_wide %>%
filter(!is.na(heme_cat2)) %>%
summarise(
total = n(),
n_ai = sum(ch_cnv_auto),
prop_ai = n_ai/total,
n_sm = sum(ch_nonsilent),
n_sm_only = sum(ch_nonsilent & (!ch_cnv_auto)),
prop_sm_only = n_sm_only/total,
n_ai_only = sum(ch_cnv_auto & (!ch_nonsilent)),
prop_ai_only = n_ai_only/total
)
M_wide %>%
filter(!is.na(heme_cat2)) %>%
group_by(heme_cat2) %>%
summarise(
total = n(),
n_ai = sum(ch_cnv_auto),
n_sm = sum(ch_nonsilent),
n_sm_only = sum(ch_nonsilent & (!ch_cnv_auto)),
n_ai_only = sum(ch_cnv_auto & (!ch_nonsilent))
)
M_wide %>% count(ch_cnv)
M_wide %>% filter((JAK2) & chr9_p_loh) %>% filter(post_heme == 1) %>% pull(VAF_JAK2)
colnames(M_wide)
options(repr.plot.width = 8, repr.plot.height = 5, repr.plot.res = 300)
fit = M_wide %>%
survfit(formula = Surv(time_lastfu, death_status) ~ ch_cnv_auto, data = .)
size_coef = 0.3
p = ggsurvplot(
fit,
# Defining plotting parameters
data = M_wide,
size = 3 * size_coef, # Change line size
fontsize = 10 * size_coef, # Size of the table's labels
palette = 'Dark2',
ggtheme = theme_classic(), # Change ggplot2 theme
tables.theme = clean_theme(), # Change table's theme
# Adding useful info to the plot
# conf.int = TRUE, # Add confidence interval
pval = TRUE, # Add p-value
pval.size = 12 * size_coef,
risk.table = TRUE, # Add risk table
# Adding labels and title
# title = paste0(g," survival curves"),
# legend.labs = c("Normal", "CH-SM", "CH-CNV"), # Change legend labels
legend.title = "",
legend = "right",
xlab = "Time in days", # Customize X axis label
ylab = "tMN free proportion", # Customize Y axis label
# Defining risk table plotting parameters
risk.table.col = "strata", # Risk table color by groups
risk.table.height = 0.25, # Useful to change when you have multiple groups
risk.table.y.text = FALSE # Displaying colors instead of name in risk table
)
do_plot(p, "R1_overall_survival.pdf", w = 5, h = 5)